[llvm] [LV] Support float and pointer conditional scalar assignments (PR #184101)

Thu Mar 5 03:19:09 PST 2026

https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/184101

>From 03d2eb3716a52a95166767a1672f356129ededcd Mon Sep 17 00:00:00 2001
From: Damian Heaton <Damian.Heaton at arm.com>
Date: Wed, 18 Feb 2026 14:35:31 +0000
Subject: [PATCH 1/9] Support CSA for floating-point & pointer types

Expands support for the `FindLast` Reccurence Kind
to floating-point and pointer types, thereby
enabling conditional scalar assignment for these
types.

As integer and floating-point reccurence kinds are
considered mutually exclusive, a specific
floating-point kind is added (`FFindLast`).
Matching this recurrence kind is gated behind the
recurrence being vectorizable by the target, in
order to avoid regressing in some tests (where a
`FFindLast` is identified and generated, but
cannot currently then be vectorized, resulting in
scalar codegen).
---
 llvm/include/llvm/Analysis/IVDescriptors.h    |   5 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   2 +-
 llvm/lib/Analysis/IVDescriptors.cpp           |  18 +-
 .../AArch64/AArch64TargetTransformInfo.cpp    |   1 +
 .../Vectorize/LoopVectorizationLegality.cpp   |   4 +-
 .../Transforms/Vectorize/SLPVectorizer.cpp    |   3 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   3 +
 .../CostModel/AArch64/extract-last-active.ll  |  25 +++
 .../AArch64/conditional-scalar-assignment.ll  | 175 ++++++++++++++----
 .../LoopVectorize/AArch64/sve-select-cmp.ll   |  15 +-
 10 files changed, 204 insertions(+), 47 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 592eb2e90c1c6..5495596916b03 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -65,6 +65,9 @@ enum class RecurKind {
   FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y
                   ///< are an integer type, one is the current recurrence value,
                   ///< and the other is an arbitrary value.
+  FFindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y are
+              ///< a floating-point type, one is the current recurrence value,
+              ///< and the other is an arbitrary value.
   // clang-format on
   // TODO: Any_of and FindLast reduction need not be restricted to integer type
   // only.
@@ -285,7 +288,7 @@ class RecurrenceDescriptor {
   ///   select(cmp(),x,y) where one of (x,y) is an arbitrary value and the
   ///   other is a recurrence.
   static bool isFindLastRecurrenceKind(RecurKind Kind) {
-    return Kind == RecurKind::FindLast;
+    return Kind == RecurKind::FindLast || Kind == RecurKind::FFindLast;
   }
 
   static bool isFindRecurrenceKind(RecurKind Kind) {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index e062b70be6b59..93667cda8aa3f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1115,7 +1115,7 @@ class TargetTransformInfoImplBase {
 
   virtual bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
                                            ElementCount VF) const {
-    return true;
+    return RdxDesc.getRecurrenceKind() != RecurKind::FFindLast;
   }
 
   virtual bool isElementTypeLegalForScalableVector(Type *Ty) const {
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 349f23bde21d4..d4b61e0562b27 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -55,7 +55,6 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
   case RecurKind::UMin:
   case RecurKind::AnyOf:
   case RecurKind::FindIV:
-  // TODO: Make type-agnostic.
   case RecurKind::FindLast:
     return true;
   }
@@ -510,6 +509,9 @@ bool RecurrenceDescriptor::AddReductionVar(
     if (!isIntegerRecurrenceKind(Kind))
       return false;
     Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts);
+  } else if (ScalarTy->isPointerTy()) {
+    if (Kind != RecurKind::FindLast)
+      return false;
   } else {
     // Pointer min/max may exist, but it is not supported as a reduction op.
     return false;
@@ -649,7 +651,7 @@ bool RecurrenceDescriptor::AddReductionVar(
         // "FindLast-like" phis are not supported see:
         // IVDescriptorsTest.UnsupportedFindLastPhi.
         FoundFindLastLikePhi =
-            Kind == RecurKind::FindLast && !FoundFindLastLikePhi &&
+            isFindLastRecurrenceKind(Kind) && !FoundFindLastLikePhi &&
             isFindLastLikePhi(cast<PHINode>(Cur), Phi, VisitedInsts);
         if (!FoundFindLastLikePhi)
           return false;
@@ -734,7 +736,7 @@ bool RecurrenceDescriptor::AddReductionVar(
   // We only expect to match a single "find-last-like" phi per find-last
   // reduction, with no non-phi operations in the reduction use chain.
   assert((!FoundFindLastLikePhi ||
-          (Kind == RecurKind::FindLast && NumNonPHIUsers == 0)) &&
+          (isFindLastRecurrenceKind(Kind) && NumNonPHIUsers == 0)) &&
          "Unexpectedly matched a 'find-last-like' phi");
 
   if (isAnyOfRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1)
@@ -936,7 +938,9 @@ RecurrenceDescriptor::isFindPattern(Loop *TheLoop, PHINode *OrigPhi,
                                      m_Value(NonRdxPhi)))))
     return InstDesc(false, I);
 
-  return InstDesc(I, RecurKind::FindLast);
+  return OrigPhi->getType()->isFloatingPointTy()
+             ? InstDesc(I, RecurKind::FFindLast)
+             : InstDesc(I, RecurKind::FindLast);
 }
 
 /// Returns true if the select instruction has users in the compare-and-add
@@ -1119,6 +1123,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
     LLVM_DEBUG(dbgs() << "Found a Find reduction PHI." << *Phi << "\n");
     return true;
   }
+  if (AddReductionVar(Phi, RecurKind::FFindLast, TheLoop, FMF, RedDes, DB, AC,
+                      DT, SE)) {
+    LLVM_DEBUG(dbgs() << "Found a float Find reduction PHI." << *Phi << "\n");
+    return true;
+  }
   if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT,
                       SE)) {
     LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
@@ -1260,6 +1269,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
   case RecurKind::FMinimumNum:
     return Instruction::FCmp;
   case RecurKind::FindLast:
+  case RecurKind::FFindLast:
   case RecurKind::AnyOf:
   case RecurKind::FindIV:
     // TODO: Set AnyOf and FindIV to Instruction::Select once in-loop reductions
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b96d251a9c4d9..f8cba1ac567fb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5562,6 +5562,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
   case RecurKind::FMulAdd:
   case RecurKind::AnyOf:
   case RecurKind::FindLast:
+  case RecurKind::FFindLast:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index fec32762db55f..34bf26c4ded01 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -882,7 +882,9 @@ bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) {
 
     RecurrenceDescriptor RedDes;
     if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT,
-                                             PSE.getSE())) {
+                                             PSE.getSE()) &&
+        (RedDes.getRecurrenceKind() != RecurKind::FFindLast || !TTI ||
+         TTI->isLegalToVectorizeReduction(RedDes, ElementCount::getFixed(4)))) {
       Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
       AllowedExit.insert(RedDes.getLoopExitInstr());
       Reductions[Phi] = std::move(RedDes);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3a10d2b2a7158..59645c53d9548 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -26923,6 +26923,7 @@ class HorizontalReduction {
         case RecurKind::AnyOf:
         case RecurKind::FindIV:
         case RecurKind::FindLast:
+        case RecurKind::FFindLast:
         case RecurKind::FMaxNum:
         case RecurKind::FMinNum:
         case RecurKind::FMaximumNum:
@@ -27065,6 +27066,7 @@ class HorizontalReduction {
     case RecurKind::AnyOf:
     case RecurKind::FindIV:
     case RecurKind::FindLast:
+    case RecurKind::FFindLast:
     case RecurKind::FMaxNum:
     case RecurKind::FMinNum:
     case RecurKind::FMaximumNum:
@@ -27169,6 +27171,7 @@ class HorizontalReduction {
     case RecurKind::AnyOf:
     case RecurKind::FindIV:
     case RecurKind::FindLast:
+    case RecurKind::FFindLast:
     case RecurKind::FMaxNum:
     case RecurKind::FMinNum:
     case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 4c393f2f6f05a..ed685e99c5a51 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5741,6 +5741,9 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
     if (!SE.isKnownNonZero(Step))
       continue;
 
+    if (IVSCEV->getType()->isPointerTy())
+      continue;
+
     // Positive step means we need UMax/SMax to find the last IV value, and
     // UMin/SMin otherwise.
     bool UseMax = SE.isKnownPositive(Step);
diff --git a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
index 9efcf912076b0..96531a6dada69 100644
--- a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
@@ -13,6 +13,7 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2p0 = call ptr @llvm.experimental.vector.extract.last.active.v2p0(<2 x ptr> poison, <2 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
@@ -21,6 +22,7 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv2p0(<vscale x 2 x ptr> poison, <vscale x 2 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
@@ -29,6 +31,7 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4p0 = call ptr @llvm.experimental.vector.extract.last.active.v4p0(<4 x ptr> poison, <4 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
@@ -37,6 +40,7 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv4p0(<vscale x 4 x ptr> poison, <vscale x 4 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
@@ -45,6 +49,7 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1p0 = call ptr @llvm.experimental.vector.extract.last.active.v1p0(<1 x ptr> poison, <1 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
@@ -53,6 +58,7 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SVE-LABEL: 'extractions'
@@ -64,6 +70,7 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2p0 = call ptr @llvm.experimental.vector.extract.last.active.v2p0(<2 x ptr> poison, <2 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
@@ -72,6 +79,7 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv2p0(<vscale x 2 x ptr> poison, <vscale x 2 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
@@ -80,6 +88,7 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4p0 = call ptr @llvm.experimental.vector.extract.last.active.v4p0(<4 x ptr> poison, <4 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
@@ -88,6 +97,7 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv4p0(<vscale x 4 x ptr> poison, <vscale x 4 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
@@ -96,6 +106,7 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1p0 = call ptr @llvm.experimental.vector.extract.last.active.v1p0(<1 x ptr> poison, <1 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
@@ -104,6 +115,7 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SME-STREAMING-LABEL: 'extractions'
@@ -115,6 +127,7 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2p0 = call ptr @llvm.experimental.vector.extract.last.active.v2p0(<2 x ptr> poison, <2 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
@@ -123,6 +136,7 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv2p0(<vscale x 2 x ptr> poison, <vscale x 2 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
@@ -131,6 +145,7 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4p0 = call ptr @llvm.experimental.vector.extract.last.active.v4p0(<4 x ptr> poison, <4 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
@@ -139,6 +154,7 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv4p0(<vscale x 4 x ptr> poison, <vscale x 4 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
@@ -147,6 +163,7 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1p0 = call ptr @llvm.experimental.vector.extract.last.active.v1p0(<1 x ptr> poison, <1 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
@@ -155,6 +172,7 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
@@ -167,6 +185,7 @@ define void @extractions() {
   %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
   %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
   %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
+  %v2p0 = call ptr @llvm.experimental.vector.extract.last.active.v2p0(<2 x ptr> poison, <2 x i1> poison, ptr poison)
   %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
   %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
   %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
@@ -175,6 +194,7 @@ define void @extractions() {
   %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
   %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
   %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
+  %nxv2p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv2p0(<vscale x 2 x ptr> poison, <vscale x 2 x i1> poison, ptr poison)
 
   ;; Wider-than-legal
   %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
@@ -185,6 +205,7 @@ define void @extractions() {
   %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
   %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
   %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
+  %v4p0 = call ptr @llvm.experimental.vector.extract.last.active.v4p0(<4 x ptr> poison, <4 x i1> poison, ptr poison)
   %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
   %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
   %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
@@ -193,6 +214,7 @@ define void @extractions() {
   %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
   %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
   %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
+  %nxv4p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv4p0(<vscale x 4 x ptr> poison, <vscale x 4 x i1> poison, ptr poison)
 
   ;; Narrower-than-legal
   %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
@@ -203,6 +225,7 @@ define void @extractions() {
   %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
   %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
   %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
+  %v1p0 = call ptr @llvm.experimental.vector.extract.last.active.v1p0(<1 x ptr> poison, <1 x i1> poison, ptr poison)
   %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
   %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
   %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
@@ -211,6 +234,8 @@ define void @extractions() {
   %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
   %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
   %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+  %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
+
 
   ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index 7053aa60b2035..221aa7dc6eb82 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -115,20 +115,57 @@ define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) {
 ; SVE-LABEL: define ptr @simple_csa_ptr_select(
 ; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) #[[ATTR0]] {
 ; SVE-NEXT:  [[ENTRY:.*]]:
+; SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
+; SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE:       [[VECTOR_PH]]:
+; SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1
+; SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[A]], i64 0
+; SVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; SVE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[INIT]], i64 0
+; SVE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; SVE-NEXT:    br label %[[LOOP:.*]]
 ; SVE:       [[LOOP]]:
-; SVE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; SVE-NEXT:    [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; SVE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x ptr> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[LOOP]] ]
+; SVE-NEXT:    [[TMP4:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
 ; SVE-NEXT:    [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]]
-; SVE-NEXT:    [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4
+; SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[LD_ADDR]], align 4
+; SVE-NEXT:    [[TMP6:%.*]] = ptrtoint <vscale x 2 x ptr> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; SVE-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; SVE-NEXT:    [[TMP8:%.*]] = freeze <vscale x 2 x i1> [[TMP7]]
+; SVE-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP8]])
+; SVE-NEXT:    [[TMP10]] = select i1 [[TMP9]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> [[TMP4]]
+; SVE-NEXT:    [[TMP11]] = select i1 [[TMP9]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[VEC_PHI]]
+; SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
+; SVE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; SVE:       [[MIDDLE_BLOCK]]:
+; SVE-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 2 x ptr> [[BROADCAST_SPLAT2]], i32 0
+; SVE-NEXT:    [[TMP14:%.*]] = call ptr @llvm.experimental.vector.extract.last.active.nxv2p0(<vscale x 2 x ptr> [[TMP11]], <vscale x 2 x i1> [[TMP10]], ptr [[TMP13]])
+; SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE:       [[SCALAR_PH]]:
+; SVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi ptr [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; SVE-NEXT:    br label %[[LOOP1:.*]]
+; SVE:       [[LOOP1]]:
+; SVE-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP1]] ]
+; SVE-NEXT:    [[DATA_PHI:%.*]] = phi ptr [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP1]] ]
+; SVE-NEXT:    [[LD_ADDR1:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV1]]
+; SVE-NEXT:    [[LD:%.*]] = load ptr, ptr [[LD_ADDR1]], align 4
 ; SVE-NEXT:    [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64
 ; SVE-NEXT:    [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]]
 ; SVE-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]]
-; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; SVE-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
 ; SVE:       [[EXIT]]:
-; SVE-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ]
+; SVE-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ]
 ; SVE-NEXT:    ret ptr [[SELECT_DATA_LCSSA]]
 ;
 entry:
@@ -154,37 +191,99 @@ define float @simple_csa_float_select(i64 %N, ptr %data, float %a) {
 ; NEON-LABEL: define float @simple_csa_float_select(
 ; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) {
 ; NEON-NEXT:  [[ENTRY:.*]]:
+; NEON-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 20
+; NEON-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NEON:       [[VECTOR_PH]]:
+; NEON-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; NEON-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NEON-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; NEON-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; NEON-NEXT:    br label %[[LOOP:.*]]
 ; NEON:       [[LOOP]]:
-; NEON-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; NEON-NEXT:    [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; NEON-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+; NEON-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+00), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[LOOP]] ]
+; NEON-NEXT:    [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[LOOP]] ]
 ; NEON-NEXT:    [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
-; NEON-NEXT:    [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4
+; NEON-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[LD_ADDR]], align 4
+; NEON-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; NEON-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; NEON-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; NEON-NEXT:    [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]]
+; NEON-NEXT:    [[TMP6]] = select i1 [[TMP4]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]
+; NEON-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; NEON-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NEON-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; NEON:       [[MIDDLE_BLOCK]]:
+; NEON-NEXT:    [[TMP8:%.*]] = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> [[TMP6]], <4 x i1> [[TMP5]], float -1.000000e+00)
+; NEON-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NEON-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NEON:       [[SCALAR_PH]]:
+; NEON-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; NEON-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+00, %[[ENTRY]] ]
+; NEON-NEXT:    br label %[[LOOP1:.*]]
+; NEON:       [[LOOP1]]:
+; NEON-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP1]] ]
+; NEON-NEXT:    [[DATA_PHI:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP1]] ]
+; NEON-NEXT:    [[LD_ADDR1:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV1]]
+; NEON-NEXT:    [[LD:%.*]] = load float, ptr [[LD_ADDR1]], align 4
 ; NEON-NEXT:    [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]]
 ; NEON-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]]
-; NEON-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; NEON-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]]
 ; NEON:       [[EXIT]]:
-; NEON-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ]
+; NEON-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; NEON-NEXT:    ret float [[SELECT_DATA_LCSSA]]
 ;
 ; SVE-LABEL: define float @simple_csa_float_select(
 ; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) #[[ATTR0]] {
 ; SVE-NEXT:  [[ENTRY:.*]]:
+; SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE:       [[VECTOR_PH]]:
+; SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[A]], i64 0
+; SVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; SVE-NEXT:    br label %[[LOOP:.*]]
 ; SVE:       [[LOOP]]:
-; SVE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; SVE-NEXT:    [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; SVE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ splat (float -1.000000e+00), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
+; SVE-NEXT:    [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[LOOP]] ]
 ; SVE-NEXT:    [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
-; SVE-NEXT:    [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4
+; SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[LD_ADDR]], align 4
+; SVE-NEXT:    [[TMP6:%.*]] = fcmp olt <vscale x 4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; SVE-NEXT:    [[TMP7:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
+; SVE-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP7]])
+; SVE-NEXT:    [[TMP9]] = select i1 [[TMP8]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
+; SVE-NEXT:    [[TMP10]] = select i1 [[TMP8]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
+; SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
+; SVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; SVE:       [[MIDDLE_BLOCK]]:
+; SVE-NEXT:    [[TMP12:%.*]] = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> [[TMP10]], <vscale x 4 x i1> [[TMP9]], float -1.000000e+00)
+; SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE:       [[SCALAR_PH]]:
+; SVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+00, %[[ENTRY]] ]
+; SVE-NEXT:    br label %[[LOOP1:.*]]
+; SVE:       [[LOOP1]]:
+; SVE-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP1]] ]
+; SVE-NEXT:    [[DATA_PHI:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP1]] ]
+; SVE-NEXT:    [[LD_ADDR1:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV1]]
+; SVE-NEXT:    [[LD:%.*]] = load float, ptr [[LD_ADDR1]], align 4
 ; SVE-NEXT:    [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]]
 ; SVE-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]]
-; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; SVE-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
 ; SVE:       [[EXIT]]:
-; SVE-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ]
+; SVE-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
 ; SVE-NEXT:    ret float [[SELECT_DATA_LCSSA]]
 ;
 entry:
@@ -498,7 +597,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
 ; NEON-NEXT:    [[TMP11]] = select i1 [[TMP9]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]
 ; NEON-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; NEON-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NEON-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NEON-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; NEON:       [[MIDDLE_BLOCK]]:
 ; NEON-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP11]], <4 x i1> [[TMP10]], i32 -1)
 ; NEON-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -523,7 +622,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
 ; NEON-NEXT:    [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]]
 ; NEON-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NEON-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; NEON-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; NEON:       [[EXIT]]:
 ; NEON-NEXT:    [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; NEON-NEXT:    ret i32 [[SELECT_A_LCSSA]]
@@ -563,7 +662,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
 ; SVE-NEXT:    [[TMP15]] = select i1 [[TMP13]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
 ; SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; SVE-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SVE-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SVE:       [[MIDDLE_BLOCK]]:
 ; SVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x i1> [[TMP14]], i32 -1)
 ; SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -588,7 +687,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
 ; SVE-NEXT:    [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]]
 ; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SVE-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; SVE:       [[EXIT]]:
 ; SVE-NEXT:    [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
 ; SVE-NEXT:    ret i32 [[SELECT_A_LCSSA]]
@@ -665,7 +764,7 @@ define i8 @simple_csa_byte_select(i64 %N, ptr %data, i8 %a) {
 ; SVE-NEXT:    [[TMP10]] = select i1 [[TMP8]], <vscale x 16 x i8> [[WIDE_LOAD]], <vscale x 16 x i8> [[VEC_PHI]]
 ; SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; SVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; SVE:       [[MIDDLE_BLOCK]]:
 ; SVE-NEXT:    [[TMP12:%.*]] = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i1> [[TMP9]], i8 -1)
 ; SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -683,7 +782,7 @@ define i8 @simple_csa_byte_select(i64 %N, ptr %data, i8 %a) {
 ; SVE-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i8 [[LD]], i8 [[DATA_PHI]]
 ; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SVE-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ; SVE:       [[EXIT]]:
 ; SVE-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi i8 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
 ; SVE-NEXT:    ret i8 [[SELECT_DATA_LCSSA]]
@@ -731,7 +830,7 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
 ; NEON-NEXT:    [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]
 ; NEON-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
 ; NEON-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NEON-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NEON-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; NEON:       [[MIDDLE_BLOCK]]:
 ; NEON-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 -1)
 ; NEON-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -749,7 +848,7 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
 ; NEON-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
 ; NEON-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; NEON-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; NEON-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; NEON:       [[EXIT]]:
 ; NEON-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; NEON-NEXT:    ret i32 [[SELECT_DATA_LCSSA]]
@@ -782,7 +881,7 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
 ; SVE-NEXT:    [[TMP10]] = select i1 [[TMP8]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
 ; SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; SVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SVE:       [[MIDDLE_BLOCK]]:
 ; SVE-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i1> [[TMP9]], i32 -1)
 ; SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -800,7 +899,7 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
 ; SVE-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
 ; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SVE-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; SVE:       [[EXIT]]:
 ; SVE-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
 ; SVE-NEXT:    ret i32 [[SELECT_DATA_LCSSA]]
@@ -888,7 +987,7 @@ define i32 @simple_csa_int_load(ptr noalias %a, ptr noalias %b, i32 %default_val
 ; SVE-NEXT:    [[TMP11]] = select i1 [[TMP9]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
 ; SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; SVE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SVE-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; SVE:       [[MIDDLE_BLOCK]]:
 ; SVE-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
 ; SVE-NEXT:    [[TMP14:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP13]])
@@ -913,7 +1012,7 @@ define i32 @simple_csa_int_load(ptr noalias %a, ptr noalias %b, i32 %default_val
 ; SVE-NEXT:    [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
 ; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SVE-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
 ; SVE:       [[EXIT]]:
 ; SVE-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ]
 ; SVE-NEXT:    ret i32 [[SELECT_DATA_LCSSA]]
@@ -1008,7 +1107,7 @@ define i32 @simple_csa_int_divide(ptr noalias %a, ptr noalias %b, i32 %default_v
 ; SVE-NEXT:    [[TMP12]] = select i1 [[TMP10]], <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]]
 ; SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; SVE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SVE-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; SVE:       [[MIDDLE_BLOCK]]:
 ; SVE-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
 ; SVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i1> [[TMP11]], i32 [[TMP14]])
@@ -1032,7 +1131,7 @@ define i32 @simple_csa_int_divide(ptr noalias %a, ptr noalias %b, i32 %default_v
 ; SVE-NEXT:    [[SELECT_DATA]] = phi i32 [ [[DIV]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
 ; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SVE-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
 ; SVE:       [[EXIT]]:
 ; SVE-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ]
 ; SVE-NEXT:    ret i32 [[SELECT_DATA_LCSSA]]
@@ -1138,7 +1237,7 @@ define i32 @csa_load_nested_ifs(ptr noalias %a, ptr noalias %b, i32 %default_val
 ; SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; SVE-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
 ; SVE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; SVE-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; SVE:       [[MIDDLE_BLOCK]]:
 ; SVE-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
 ; SVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x i1> [[TMP13]], i32 [[TMP16]])
@@ -1166,7 +1265,7 @@ define i32 @csa_load_nested_ifs(ptr noalias %a, ptr noalias %b, i32 %default_val
 ; SVE-NEXT:    [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ], [ [[DATA_PHI]], %[[OUTER_IF_THEN]] ]
 ; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SVE-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; SVE-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
 ; SVE:       [[EXIT]]:
 ; SVE-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
 ; SVE-NEXT:    ret i32 [[SELECT_DATA_LCSSA]]
@@ -1666,7 +1765,7 @@ define i32 @csa_find_last_phi_use_iv(ptr noalias %mask, ptr noalias %src, ptr no
 ; SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; SVE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; SVE-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; SVE-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; SVE:       [[MIDDLE_BLOCK]]:
 ; SVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv16i32(<vscale x 16 x i32> [[TMP15]], <vscale x 16 x i1> [[TMP14]], i32 0)
 ; SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -1693,7 +1792,7 @@ define i32 @csa_find_last_phi_use_iv(ptr noalias %mask, ptr noalias %src, ptr no
 ; SVE-NEXT:    [[LASTSUM_1]] = phi i32 [ [[ADD_IV]], %[[COND_ASSIGN]] ], [ [[LASTSUM_0]], %[[LOOP]] ]
 ; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SVE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; SVE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
 ; SVE:       [[EXIT]]:
 ; SVE-NEXT:    [[LASTSUM_1_LCSSA:%.*]] = phi i32 [ [[LASTSUM_1]], %[[LATCH]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
 ; SVE-NEXT:    ret i32 [[LASTSUM_1_LCSSA]]
@@ -1806,7 +1905,7 @@ define i32 @csa_multiple_find_last_phi_reductions(ptr noalias %mask, ptr noalias
 ; SVE-NEXT:    [[TMP17]] = select i1 [[TMP13]], <vscale x 16 x i32> [[TMP11]], <vscale x 16 x i32> [[VEC_PHI1]]
 ; SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; SVE-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; SVE-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; SVE:       [[MIDDLE_BLOCK]]:
 ; SVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv16i32(<vscale x 16 x i32> [[TMP15]], <vscale x 16 x i1> [[TMP14]], i32 0)
 ; SVE-NEXT:    [[TMP20:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv16i32(<vscale x 16 x i32> [[TMP17]], <vscale x 16 x i1> [[TMP16]], i32 0)
@@ -1838,7 +1937,7 @@ define i32 @csa_multiple_find_last_phi_reductions(ptr noalias %mask, ptr noalias
 ; SVE-NEXT:    [[LAST_B_1]] = phi i32 [ [[B_ZEXT]], %[[COND_ASSIGN]] ], [ [[LAST_B_0]], %[[LOOP]] ]
 ; SVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SVE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; SVE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
 ; SVE:       [[EXIT]]:
 ; SVE-NEXT:    [[LAST_A_1_LCSSA:%.*]] = phi i32 [ [[LAST_A_1]], %[[LATCH]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
 ; SVE-NEXT:    [[LAST_B_1_LCSSA:%.*]] = phi i32 [ [[LAST_B_1]], %[[LATCH]] ], [ [[TMP20]], %[[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
index c348decfc88bf..1165d88312817 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
@@ -129,9 +129,20 @@ exit:                                     ; preds = %for.body
 
 define float @select_const_f32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; CHECK-VF4IC1-LABEL: @select_const_f32_from_icmp
-; CHECK-VF4IC1-NOT: vector.body
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ splat (float 3.000000e+00), %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP_F:%.*]] = freeze <vscale x 4 x i1> [[VEC_ICMP]]
+; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_ICMP_F]])
+; CHECK-VF4IC1-NEXT:    [[VEC_SEL1:%.*]] = select i1 [[TMP8]], <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i1> [[TMP4:%.*]]
+; CHECK-VF4IC1-NEXT:    [[VEC_SEL2:%.*]] = select i1 [[TMP8]], <vscale x 4 x float> splat (float 7.000000e+00), <vscale x 4 x float> [[VEC_PHI]]
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> [[VEC_SEL2]], <vscale x 4 x i1> [[VEC_SEL1]], float 3.000000e+00)
+; CHECK-VF4IC1:        %cmp.n = icmp eq i64 %n, %n.vec
+
 ; CHECK-VF4IC4-LABEL: @select_const_f32_from_icmp
-; CHECK-VF4IC4-NOT: vector.body
+; CHECK-VF4IC4:      vector.body:
 entry:
   br label %for.body
 

>From 2811a9c33c71e7fa24a57918d06cdd3d5f067329 Mon Sep 17 00:00:00 2001
From: Damian Heaton <Damian.Heaton at arm.com>
Date: Wed, 25 Feb 2026 17:36:19 +0000
Subject: [PATCH 2/9] Remove unnecessary conditions, update X86/RISCV tests

---
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   2 +-
 .../Vectorize/LoopVectorizationLegality.cpp   |   4 +-
 .../RISCV/select-cmp-reduction.ll             | 293 ++++++++++--------
 .../X86/conditional-scalar-assignment.ll      | 151 +++++++--
 .../Transforms/LoopVectorize/iv-select-cmp.ll | 132 ++++++--
 .../Transforms/LoopVectorize/select-cmp.ll    | 136 ++++++--
 6 files changed, 502 insertions(+), 216 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 93667cda8aa3f..e062b70be6b59 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1115,7 +1115,7 @@ class TargetTransformInfoImplBase {
 
   virtual bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
                                            ElementCount VF) const {
-    return RdxDesc.getRecurrenceKind() != RecurKind::FFindLast;
+    return true;
   }
 
   virtual bool isElementTypeLegalForScalableVector(Type *Ty) const {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 34bf26c4ded01..fec32762db55f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -882,9 +882,7 @@ bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) {
 
     RecurrenceDescriptor RedDes;
     if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT,
-                                             PSE.getSE()) &&
-        (RedDes.getRecurrenceKind() != RecurKind::FFindLast || !TTI ||
-         TTI->isLegalToVectorizeReduction(RedDes, ElementCount::getFixed(4)))) {
+                                             PSE.getSE())) {
       Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
       AllowedExit.insert(RedDes.getLoopExitInstr());
       Reductions[Phi] = std::move(RedDes);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
index 8971b0cadfa48..fd118aaf81a02 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
@@ -4,32 +4,32 @@
 define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) {
 ; CHECK-LABEL: define i32 @select_icmp(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr readonly captures(none) [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:  [[VECTOR_PH:.*:]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
+; CHECK:       [[VECTOR_BODY1]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp sge <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP14]])
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP14]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
-; CHECK-NEXT:    [[TMP11:%.*]] = freeze i1 [[TMP10]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[Y]], i32 0
-; CHECK-NEXT:    br label %[[FOR_END:.*]]
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sge <vscale x 4 x i32> [[VP_OP_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP4]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[FOR_END:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP8]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[Y]], i32 0
+; CHECK-NEXT:    br label %[[FOR_END1:.*]]
+; CHECK:       [[FOR_END1]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
 ;
 entry:
@@ -53,32 +53,32 @@ for.end:
 define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) {
 ; CHECK-LABEL: define i32 @select_fcmp(
 ; CHECK-SAME: float [[X:%.*]], i32 [[Y:%.*]], ptr readonly captures(none) [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[X]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:  [[VECTOR_PH:.*:]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
+; CHECK:       [[VECTOR_BODY1]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
-; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast uge <vscale x 4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP14]])
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP14]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
-; CHECK-NEXT:    [[TMP11:%.*]] = freeze i1 [[TMP10]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[Y]], i32 0
-; CHECK-NEXT:    br label %[[FOR_END:.*]]
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp fast uge <vscale x 4 x float> [[VP_OP_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP4]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[FOR_END:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP8]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[Y]], i32 0
+; CHECK-NEXT:    br label %[[FOR_END1:.*]]
+; CHECK:       [[FOR_END1]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
 ;
 entry:
@@ -102,30 +102,30 @@ for.end:
 define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: define i32 @select_const_i32_from_icmp(
 ; CHECK-SAME: ptr readonly captures(none) [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:  [[VECTOR_PH:.*:]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
+; CHECK:       [[VECTOR_BODY1]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP21]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
-; CHECK-NEXT:    [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP21]])
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP21]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
-; CHECK-NEXT:    [[TMP11:%.*]] = freeze i1 [[TMP10]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 7, i32 3
-; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
+; CHECK-NEXT:    [[TMP3]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP4]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[EXIT:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP8]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 7, i32 3
+; CHECK-NEXT:    br label %[[EXIT1:.*]]
+; CHECK:       [[EXIT1]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
 ;
 entry:
@@ -149,30 +149,30 @@ exit:                                     ; preds = %for.body
 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) {
 ; CHECK-LABEL: define i32 @select_i32_from_icmp(
 ; CHECK-SAME: ptr readonly captures(none) [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:  [[VECTOR_PH:.*:]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
+; CHECK:       [[VECTOR_BODY1]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP21]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
-; CHECK-NEXT:    [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP21]])
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP21]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
-; CHECK-NEXT:    [[TMP11:%.*]] = freeze i1 [[TMP10]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[B]], i32 [[A]]
-; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
+; CHECK-NEXT:    [[TMP3]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP4]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[EXIT:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP8]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[B]], i32 [[A]]
+; CHECK-NEXT:    br label %[[EXIT1:.*]]
+; CHECK:       [[EXIT1]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
 ;
 entry:
@@ -196,30 +196,30 @@ exit:                                     ; preds = %for.body
 define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: define i32 @select_const_i32_from_fcmp(
 ; CHECK-SAME: ptr readonly captures(none) [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:  [[VECTOR_PH:.*:]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
+; CHECK:       [[VECTOR_BODY1]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP21]])
-; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast one <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
-; CHECK-NEXT:    [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP21]])
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP21]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
-; CHECK-NEXT:    [[TMP11:%.*]] = freeze i1 [[TMP10]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 1, i32 2
-; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp fast one <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
+; CHECK-NEXT:    [[TMP3]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP4]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[EXIT:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP8]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 2
+; CHECK-NEXT:    br label %[[EXIT1:.*]]
+; CHECK:       [[EXIT1]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
 ;
 entry:
@@ -244,19 +244,46 @@ define float @select_const_f32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: define float @select_const_f32_from_icmp(
 ; CHECK-SAME: ptr readonly captures(none) [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
 ; CHECK:       [[FOR_BODY]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi fast float [ 3.000000e+00, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x float> [ splat (float 3.000000e+00), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <8 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-NEXT:    [[TMP15:%.*]] = freeze <8 x i1> [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP13]] = select i1 [[TMP16]], <8 x i1> [[TMP14]], <8 x i1> [[TMP12]]
+; CHECK-NEXT:    [[TMP10]] = select i1 [[TMP16]], <8 x float> splat (float 7.000000e+00), <8 x float> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> [[TMP10]], <8 x i1> [[TMP13]], float 3.000000e+00)
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY1:.*]]
+; CHECK:       [[FOR_BODY1]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP6:%.*]], %[[FOR_BODY1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi fast float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[TMP5:%.*]], %[[FOR_BODY1]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3
 ; CHECK-NEXT:    [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00
-; CHECK-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP9]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[EXIT]], label %[[FOR_BODY1]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], %[[FOR_BODY1]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[DOTLCSSA]]
 ;
 entry:
@@ -280,33 +307,33 @@ exit:                                     ; preds = %for.body
 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) {
 ; CHECK-LABEL: define i32 @pred_select_const_i32_from_icmp(
 ; CHECK-SAME: ptr noalias readonly captures(none) [[SRC1:%.*]], ptr noalias readonly captures(none) [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:  [[VECTOR_PH:.*:]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
+; CHECK:       [[VECTOR_BODY1]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 35)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> [[TMP7]], i32 [[TMP17]])
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 2)
-; CHECK-NEXT:    [[TMP10:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
-; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[VEC_PHI]]
-; CHECK-NEXT:    [[PREDPHI]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[PREDPHI1]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP17]])
-; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP17]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 35)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP3]], <vscale x 4 x i1> [[TMP2]], i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <vscale x 4 x i32> [[VP_OP_LOAD1]], splat (i32 2)
+; CHECK-NEXT:    [[TMP5:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP11]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[PREDPHI]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP7]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[PREDPHI]])
-; CHECK-NEXT:    [[TMP13:%.*]] = freeze i1 [[TMP12]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP11]])
+; CHECK-NEXT:    [[TMP10:%.*]] = freeze i1 [[TMP9]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i32 1, i32 0
 ; CHECK-NEXT:    br label %[[FOR_END_LOOPEXIT:.*]]
 ; CHECK:       [[FOR_END_LOOPEXIT]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/X86/conditional-scalar-assignment.ll
index 3d0905081e2a5..5aae5d4929d91 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conditional-scalar-assignment.ll
@@ -141,20 +141,53 @@ define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) {
 ; AVX512-LABEL: define ptr @simple_csa_ptr_select(
 ; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) #[[ATTR0]] {
 ; AVX512-NEXT:  [[ENTRY:.*]]:
+; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 24
+; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; AVX512:       [[VECTOR_PH]]:
+; AVX512-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; AVX512-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; AVX512-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[A]], i64 0
+; AVX512-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x ptr> poison, ptr [[INIT]], i64 0
+; AVX512-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT1]], <8 x ptr> poison, <8 x i32> zeroinitializer
 ; AVX512-NEXT:    br label %[[LOOP:.*]]
 ; AVX512:       [[LOOP]]:
-; AVX512-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; AVX512-NEXT:    [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; AVX512-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+; AVX512-NEXT:    [[VEC_PHI:%.*]] = phi <8 x ptr> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[LOOP]] ]
+; AVX512-NEXT:    [[TMP0:%.*]] = phi <8 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[LOOP]] ]
 ; AVX512-NEXT:    [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]]
-; AVX512-NEXT:    [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x ptr>, ptr [[LD_ADDR]], align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = ptrtoint <8 x ptr> [[WIDE_LOAD]] to <8 x i64>
+; AVX512-NEXT:    [[TMP3:%.*]] = icmp slt <8 x i64> [[BROADCAST_SPLAT]], [[TMP2]]
+; AVX512-NEXT:    [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
+; AVX512-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP4]])
+; AVX512-NEXT:    [[TMP6]] = select i1 [[TMP5]], <8 x i1> [[TMP3]], <8 x i1> [[TMP0]]
+; AVX512-NEXT:    [[TMP7]] = select i1 [[TMP5]], <8 x ptr> [[WIDE_LOAD]], <8 x ptr> [[VEC_PHI]]
+; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; AVX512-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; AVX512:       [[MIDDLE_BLOCK]]:
+; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <8 x ptr> [[BROADCAST_SPLAT2]], i32 0
+; AVX512-NEXT:    [[TMP10:%.*]] = call ptr @llvm.experimental.vector.extract.last.active.v8p0(<8 x ptr> [[TMP7]], <8 x i1> [[TMP6]], ptr [[TMP9]])
+; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; AVX512:       [[SCALAR_PH]]:
+; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; AVX512-NEXT:    [[BC_MERGE_RDX:%.*]] = phi ptr [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; AVX512-NEXT:    br label %[[LOOP1:.*]]
+; AVX512:       [[LOOP1]]:
+; AVX512-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP1]] ]
+; AVX512-NEXT:    [[DATA_PHI:%.*]] = phi ptr [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP1]] ]
+; AVX512-NEXT:    [[LD_ADDR1:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV1]]
+; AVX512-NEXT:    [[LD:%.*]] = load ptr, ptr [[LD_ADDR1]], align 4
 ; AVX512-NEXT:    [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64
 ; AVX512-NEXT:    [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]]
 ; AVX512-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]]
-; AVX512-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; AVX512-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; AVX512-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; AVX512-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; AVX512-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
 ; AVX512:       [[EXIT]]:
-; AVX512-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ]
+; AVX512-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
 ; AVX512-NEXT:    ret ptr [[SELECT_DATA_LCSSA]]
 ;
 entry:
@@ -180,37 +213,95 @@ define float @simple_csa_float_select(i64 %N, ptr %data, float %a) {
 ; X86-LABEL: define float @simple_csa_float_select(
 ; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) {
 ; X86-NEXT:  [[ENTRY:.*]]:
+; X86-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; X86-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; X86:       [[VECTOR_PH]]:
+; X86-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; X86-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; X86-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; X86-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; X86-NEXT:    br label %[[LOOP:.*]]
 ; X86:       [[LOOP]]:
-; X86-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; X86-NEXT:    [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; X86-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+; X86-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+00), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[LOOP]] ]
+; X86-NEXT:    [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[LOOP]] ]
 ; X86-NEXT:    [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
-; X86-NEXT:    [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4
+; X86-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[LD_ADDR]], align 4
+; X86-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; X86-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; X86-NEXT:    [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]]
+; X86-NEXT:    [[TMP6]] = select i1 [[TMP4]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]
+; X86-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; X86-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; X86:       [[MIDDLE_BLOCK]]:
+; X86-NEXT:    [[TMP8:%.*]] = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> [[TMP6]], <4 x i1> [[TMP5]], float -1.000000e+00)
+; X86-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; X86-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; X86:       [[SCALAR_PH]]:
+; X86-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; X86-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+00, %[[ENTRY]] ]
+; X86-NEXT:    br label %[[LOOP1:.*]]
+; X86:       [[LOOP1]]:
+; X86-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP1]] ]
+; X86-NEXT:    [[DATA_PHI:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP1]] ]
+; X86-NEXT:    [[LD_ADDR1:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV1]]
+; X86-NEXT:    [[LD:%.*]] = load float, ptr [[LD_ADDR1]], align 4
 ; X86-NEXT:    [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]]
 ; X86-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]]
-; X86-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; X86-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; X86-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; X86-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; X86-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
 ; X86:       [[EXIT]]:
-; X86-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ]
+; X86-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; X86-NEXT:    ret float [[SELECT_DATA_LCSSA]]
 ;
 ; AVX512-LABEL: define float @simple_csa_float_select(
 ; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) #[[ATTR0]] {
 ; AVX512-NEXT:  [[ENTRY:.*]]:
+; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
+; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; AVX512:       [[VECTOR_PH]]:
+; AVX512-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; AVX512-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; AVX512-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> poison, float [[A]], i64 0
+; AVX512-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> poison, <16 x i32> zeroinitializer
 ; AVX512-NEXT:    br label %[[LOOP:.*]]
 ; AVX512:       [[LOOP]]:
-; AVX512-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; AVX512-NEXT:    [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; AVX512-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+; AVX512-NEXT:    [[VEC_PHI:%.*]] = phi <16 x float> [ splat (float -1.000000e+00), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[LOOP]] ]
+; AVX512-NEXT:    [[TMP0:%.*]] = phi <16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[LOOP]] ]
 ; AVX512-NEXT:    [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
-; AVX512-NEXT:    [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[LD_ADDR]], align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = fcmp olt <16 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; AVX512-NEXT:    [[TMP3:%.*]] = freeze <16 x i1> [[TMP2]]
+; AVX512-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
+; AVX512-NEXT:    [[TMP5]] = select i1 [[TMP4]], <16 x i1> [[TMP2]], <16 x i1> [[TMP0]]
+; AVX512-NEXT:    [[TMP6]] = select i1 [[TMP4]], <16 x float> [[WIDE_LOAD]], <16 x float> [[VEC_PHI]]
+; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 16
+; AVX512-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; AVX512:       [[MIDDLE_BLOCK]]:
+; AVX512-NEXT:    [[TMP8:%.*]] = call float @llvm.experimental.vector.extract.last.active.v16f32(<16 x float> [[TMP6]], <16 x i1> [[TMP5]], float -1.000000e+00)
+; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; AVX512:       [[SCALAR_PH]]:
+; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; AVX512-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+00, %[[ENTRY]] ]
+; AVX512-NEXT:    br label %[[LOOP1:.*]]
+; AVX512:       [[LOOP1]]:
+; AVX512-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP1]] ]
+; AVX512-NEXT:    [[DATA_PHI:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP1]] ]
+; AVX512-NEXT:    [[LD_ADDR1:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV1]]
+; AVX512-NEXT:    [[LD:%.*]] = load float, ptr [[LD_ADDR1]], align 4
 ; AVX512-NEXT:    [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]]
 ; AVX512-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]]
-; AVX512-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; AVX512-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; AVX512-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; AVX512-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; AVX512-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
 ; AVX512:       [[EXIT]]:
-; AVX512-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ]
+; AVX512-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; AVX512-NEXT:    ret float [[SELECT_DATA_LCSSA]]
 ;
 entry:
@@ -524,7 +615,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
 ; X86-NEXT:    [[TMP11]] = select i1 [[TMP9]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]
 ; X86-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
 ; X86-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; X86-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; X86-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
 ; X86:       [[MIDDLE_BLOCK]]:
 ; X86-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP11]], <4 x i1> [[TMP10]], i32 -1)
 ; X86-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -549,7 +640,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
 ; X86-NEXT:    [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]]
 ; X86-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; X86-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; X86-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
+; X86-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
 ; X86:       [[EXIT]]:
 ; X86-NEXT:    [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP1]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; X86-NEXT:    ret i32 [[SELECT_A_LCSSA]]
@@ -585,7 +676,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
 ; AVX512-NEXT:    [[TMP11]] = select i1 [[TMP9]], <16 x i32> [[WIDE_LOAD]], <16 x i32> [[VEC_PHI]]
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX512-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; AVX512-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; AVX512:       [[MIDDLE_BLOCK]]:
 ; AVX512-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> [[TMP11]], <16 x i1> [[TMP10]], i32 -1)
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -610,7 +701,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
 ; AVX512-NEXT:    [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]]
 ; AVX512-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; AVX512-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; AVX512-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; AVX512-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; AVX512:       [[EXIT]]:
 ; AVX512-NEXT:    [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; AVX512-NEXT:    ret i32 [[SELECT_A_LCSSA]]
@@ -665,7 +756,7 @@ define i8 @simple_csa_byte_select(i64 %N, ptr %data, i8 %a) {
 ; X86-NEXT:    [[TMP6]] = select i1 [[TMP4]], <16 x i8> [[WIDE_LOAD]], <16 x i8> [[VEC_PHI]]
 ; X86-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 16
 ; X86-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; X86-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; X86-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
 ; X86:       [[MIDDLE_BLOCK]]:
 ; X86-NEXT:    [[TMP8:%.*]] = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> [[TMP6]], <16 x i1> [[TMP5]], i8 -1)
 ; X86-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -683,7 +774,7 @@ define i8 @simple_csa_byte_select(i64 %N, ptr %data, i8 %a) {
 ; X86-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i8 [[LD]], i8 [[DATA_PHI]]
 ; X86-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; X86-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; X86-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
+; X86-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]]
 ; X86:       [[EXIT]]:
 ; X86-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi i8 [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; X86-NEXT:    ret i8 [[SELECT_DATA_LCSSA]]
@@ -712,7 +803,7 @@ define i8 @simple_csa_byte_select(i64 %N, ptr %data, i8 %a) {
 ; AVX512-NEXT:    [[TMP6]] = select i1 [[TMP4]], <64 x i8> [[WIDE_LOAD]], <64 x i8> [[VEC_PHI]]
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
 ; AVX512-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; AVX512-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; AVX512:       [[MIDDLE_BLOCK]]:
 ; AVX512-NEXT:    [[TMP8:%.*]] = call i8 @llvm.experimental.vector.extract.last.active.v64i8(<64 x i8> [[TMP6]], <64 x i1> [[TMP5]], i8 -1)
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -730,7 +821,7 @@ define i8 @simple_csa_byte_select(i64 %N, ptr %data, i8 %a) {
 ; AVX512-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i8 [[LD]], i8 [[DATA_PHI]]
 ; AVX512-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; AVX512-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; AVX512-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; AVX512-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ; AVX512:       [[EXIT]]:
 ; AVX512-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi i8 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; AVX512-NEXT:    ret i8 [[SELECT_DATA_LCSSA]]
@@ -778,7 +869,7 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
 ; X86-NEXT:    [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]
 ; X86-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
 ; X86-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; X86-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; X86-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
 ; X86:       [[MIDDLE_BLOCK]]:
 ; X86-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 -1)
 ; X86-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -796,7 +887,7 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
 ; X86-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
 ; X86-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; X86-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; X86-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]]
+; X86-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]]
 ; X86:       [[EXIT]]:
 ; X86-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; X86-NEXT:    ret i32 [[SELECT_DATA_LCSSA]]
@@ -825,7 +916,7 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
 ; AVX512-NEXT:    [[TMP6]] = select i1 [[TMP4]], <16 x i32> [[WIDE_LOAD]], <16 x i32> [[VEC_PHI]]
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX512-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; AVX512-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; AVX512:       [[MIDDLE_BLOCK]]:
 ; AVX512-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> [[TMP6]], <16 x i1> [[TMP5]], i32 -1)
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -843,7 +934,7 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
 ; AVX512-NEXT:    [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
 ; AVX512-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; AVX512-NEXT:    [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; AVX512-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; AVX512-NEXT:    br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; AVX512:       [[EXIT]]:
 ; AVX512-NEXT:    [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; AVX512-NEXT:    ret i32 [[SELECT_DATA_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
index ab2b24dc51683..6d7a3e7c80f4a 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
@@ -1862,45 +1862,121 @@ define float @not_vectorized_select_float_induction_icmp(ptr %a, ptr %b, float %
 ; CHECK-VF4IC1-LABEL: define float @not_vectorized_select_float_induction_icmp(
 ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], float [[RDX_START:%.*]], i64 [[N:%.*]]) {
 ; CHECK-VF4IC1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1:       [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = fmul float 1.000000e+00, [[DOTCAST]]
+; CHECK-VF4IC1-NEXT:    [[TMP13:%.*]] = fadd float 0.000000e+00, [[TMP3]]
+; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[RDX_START]], i64 0
+; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC1-NEXT:    br label %[[FOR_BODY:.*]]
 ; CHECK-VF4IC1:       [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT:    [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT:    [[FIV:%.*]] = phi float [ [[CONV3:%.*]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT:    [[RDX:%.*]] = phi float [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[FOR_BODY]] ]
 ; CHECK-VF4IC1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-VF4IC1-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[ARRAYIDX]], align 8
+; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-VF4IC1-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC1-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
+; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-VF4IC1-NEXT:    [[TMP8]] = select i1 [[TMP7]], <4 x i1> [[TMP5]], <4 x i1> [[TMP2]]
+; CHECK-VF4IC1-NEXT:    [[TMP9]] = select i1 [[TMP7]], <4 x float> [[VEC_IND]], <4 x float> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT:    [[VEC_IND_NEXT]] = fadd <4 x float> [[VEC_IND]], splat (float 4.000000e+00)
+; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT:    [[TMP12:%.*]] = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> [[TMP9]], <4 x i1> [[TMP8]], float [[TMP11]])
+; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1:       [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    br label %[[FOR_BODY1:.*]]
+; CHECK-VF4IC1:       [[FOR_BODY1]]:
+; CHECK-VF4IC1-NEXT:    [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT:    [[FIV:%.*]] = phi float [ [[CONV3:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT:    [[RDX:%.*]] = phi float [ [[COND:%.*]], %[[FOR_BODY1]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF4IC1-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV1]]
 ; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
 ; CHECK-VF4IC1-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
 ; CHECK-VF4IC1-NEXT:    [[COND]] = select i1 [[CMP2]], float [[FIV]], float [[RDX]]
 ; CHECK-VF4IC1-NEXT:    [[CONV3]] = fadd float [[FIV]], 1.000000e+00
-; CHECK-VF4IC1-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT:    [[INC]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-VF4IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY1]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK-VF4IC1:       [[EXIT]]:
-; CHECK-VF4IC1-NEXT:    [[COND_LCSSA:%.*]] = phi float [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[COND_LCSSA:%.*]] = phi float [ [[COND]], %[[FOR_BODY1]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-VF4IC1-NEXT:    ret float [[COND_LCSSA]]
 ;
 ; CHECK-VF4IC4-LABEL: define float @not_vectorized_select_float_induction_icmp(
 ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], float [[RDX_START:%.*]], i64 [[N:%.*]]) {
 ; CHECK-VF4IC4-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4:       [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = fmul float 1.000000e+00, [[DOTCAST]]
+; CHECK-VF4IC4-NEXT:    [[TMP13:%.*]] = fadd float 0.000000e+00, [[TMP3]]
+; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[RDX_START]], i64 0
+; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC4-NEXT:    br label %[[FOR_BODY:.*]]
 ; CHECK-VF4IC4:       [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT:    [[FIV:%.*]] = phi float [ [[CONV3:%.*]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT:    [[RDX:%.*]] = phi float [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[FOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-VF4IC4-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[ARRAYIDX]], align 8
+; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-VF4IC4-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC4-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
+; CHECK-VF4IC4-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-VF4IC4-NEXT:    [[TMP8]] = select i1 [[TMP7]], <4 x i1> [[TMP5]], <4 x i1> [[TMP2]]
+; CHECK-VF4IC4-NEXT:    [[TMP9]] = select i1 [[TMP7]], <4 x float> [[VEC_IND]], <4 x float> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = fadd <4 x float> [[VEC_IND]], splat (float 4.000000e+00)
+; CHECK-VF4IC4-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT:    [[TMP12:%.*]] = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> [[TMP9]], <4 x i1> [[TMP8]], float [[TMP11]])
+; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4:       [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    br label %[[FOR_BODY1:.*]]
+; CHECK-VF4IC4:       [[FOR_BODY1]]:
+; CHECK-VF4IC4-NEXT:    [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT:    [[FIV:%.*]] = phi float [ [[CONV3:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT:    [[RDX:%.*]] = phi float [ [[COND:%.*]], %[[FOR_BODY1]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF4IC4-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV1]]
 ; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
 ; CHECK-VF4IC4-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
 ; CHECK-VF4IC4-NEXT:    [[COND]] = select i1 [[CMP2]], float [[FIV]], float [[RDX]]
 ; CHECK-VF4IC4-NEXT:    [[CONV3]] = fadd float [[FIV]], 1.000000e+00
-; CHECK-VF4IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-VF4IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY1]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK-VF4IC4:       [[EXIT]]:
-; CHECK-VF4IC4-NEXT:    [[COND_LCSSA:%.*]] = phi float [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[COND_LCSSA:%.*]] = phi float [ [[COND]], %[[FOR_BODY1]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-VF4IC4-NEXT:    ret float [[COND_LCSSA]]
 ;
 ; CHECK-VF1IC4-LABEL: define float @not_vectorized_select_float_induction_icmp(
@@ -1975,7 +2051,7 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4IC1-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC1-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP5]])
 ; CHECK-VF4IC1-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
@@ -2001,7 +2077,7 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF4IC1-NEXT:    [[INC]] = add nuw nsw i64 [[IV_I]], 1
 ; CHECK-VF4IC1-NEXT:    [[INC3]] = add nsw i64 [[IV_J]], 1
 ; CHECK-VF4IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK-VF4IC1:       [[EXIT]]:
 ; CHECK-VF4IC1-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-VF4IC1-NEXT:    ret i64 [[COND_LCSSA]]
@@ -2061,7 +2137,7 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC4-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP17]], <4 x i64> [[TMP18]])
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP19]])
@@ -2093,7 +2169,7 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF4IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV_I]], 1
 ; CHECK-VF4IC4-NEXT:    [[INC3]] = add nsw i64 [[IV_J]], 1
 ; CHECK-VF4IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK-VF4IC4:       [[EXIT]]:
 ; CHECK-VF4IC4-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-VF4IC4-NEXT:    ret i64 [[COND_LCSSA]]
@@ -2243,7 +2319,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4IC1-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC1-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4IC1-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]])
 ; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
@@ -2266,7 +2342,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF4IC1-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
 ; CHECK-VF4IC1-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-VF4IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK-VF4IC1:       [[EXIT]]:
 ; CHECK-VF4IC1-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-VF4IC1-NEXT:    ret i64 [[COND_LCSSA]]
@@ -2331,7 +2407,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4IC4-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP18]], <4 x i64> [[TMP19]])
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP20]])
@@ -2360,7 +2436,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF4IC4-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
 ; CHECK-VF4IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-VF4IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK-VF4IC4:       [[EXIT]]:
 ; CHECK-VF4IC4-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-VF4IC4-NEXT:    ret i64 [[COND_LCSSA]]
@@ -2500,7 +2576,7 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4IC1-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-VF4IC1-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP3]])
 ; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP5]], 9223372036854775807
@@ -2521,7 +2597,7 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[NOT_IV]], i64 [[RDX]]
 ; CHECK-VF4IC1-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-VF4IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK-VF4IC1:       [[EXIT_LOOPEXIT]]:
 ; CHECK-VF4IC1-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-VF4IC1-NEXT:    br label %[[EXIT]]
@@ -2574,7 +2650,7 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4)
 ; CHECK-VF4IC4-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-VF4IC4-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP12]], <4 x i64> [[TMP13]])
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX7:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP14]])
@@ -2598,7 +2674,7 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[NOT_IV]], i64 [[RDX]]
 ; CHECK-VF4IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-VF4IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK-VF4IC4:       [[EXIT_LOOPEXIT]]:
 ; CHECK-VF4IC4-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-VF4IC4-NEXT:    br label %[[EXIT]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 83d2e0281dfe1..0e4fedfedc2ca 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -1092,23 +1092,113 @@ exit:                                     ; preds = %loop
 
 ; We don't support FP reduction variables at the moment.
 define float @select_const_f32_from_icmp(ptr %v, i64 %n) {
-; CHECK-LABEL: define float @select_const_f32_from_icmp(
-; CHECK-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RDX:%.*]] = phi fast float [ 3.000000e+00, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
-; CHECK-NEXT:    [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
-; CHECK-NEXT:    [[SEL]] = select fast i1 [[CMP_V_IV_3]], float [[RDX]], float 7.000000e+00
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[SEL_LCSSA:%.*]] = phi float [ [[SEL]], %[[LOOP]] ]
-; CHECK-NEXT:    ret float [[SEL_LCSSA]]
+; CHECK-VF4IC1-LABEL: define float @select_const_f32_from_icmp(
+; CHECK-VF4IC1-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1:       [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1:       [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float 3.000000e+00), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC1-NEXT:    [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]]
+; CHECK-VF4IC1-NEXT:    [[TMP6]] = select i1 [[TMP4]], <4 x float> splat (float 7.000000e+00), <4 x float> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> [[TMP6]], <4 x i1> [[TMP5]], float 3.000000e+00)
+; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1:       [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3.000000e+00, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    br label %[[LOOP:.*]]
+; CHECK-VF4IC1:       [[LOOP]]:
+; CHECK-VF4IC1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT:    [[RDX:%.*]] = phi fast float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT:    [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT:    [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC1-NEXT:    [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF4IC1-NEXT:    [[SEL]] = select fast i1 [[CMP_V_IV_3]], float [[RDX]], float 7.000000e+00
+; CHECK-VF4IC1-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC1-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4IC1:       [[EXIT]]:
+; CHECK-VF4IC1-NEXT:    [[SEL_LCSSA:%.*]] = phi float [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT:    ret float [[SEL_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define float @select_const_f32_from_icmp(
+; CHECK-VF4IC4-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4:       [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4:       [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float 3.000000e+00), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC4-NEXT:    [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]]
+; CHECK-VF4IC4-NEXT:    [[TMP6]] = select i1 [[TMP4]], <4 x float> splat (float 7.000000e+00), <4 x float> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> [[TMP6]], <4 x i1> [[TMP5]], float 3.000000e+00)
+; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4:       [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3.000000e+00, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    br label %[[LOOP:.*]]
+; CHECK-VF4IC4:       [[LOOP]]:
+; CHECK-VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT:    [[RDX:%.*]] = phi fast float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT:    [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT:    [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC4-NEXT:    [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF4IC4-NEXT:    [[SEL]] = select fast i1 [[CMP_V_IV_3]], float [[RDX]], float 7.000000e+00
+; CHECK-VF4IC4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC4-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4IC4:       [[EXIT]]:
+; CHECK-VF4IC4-NEXT:    [[SEL_LCSSA:%.*]] = phi float [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT:    ret float [[SEL_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define float @select_const_f32_from_icmp(
+; CHECK-VF1IC4-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT:    br label %[[LOOP:.*]]
+; CHECK-VF1IC4:       [[LOOP]]:
+; CHECK-VF1IC4-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT:    [[RDX:%.*]] = phi fast float [ 3.000000e+00, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT:    [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT:    [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF1IC4-NEXT:    [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF1IC4-NEXT:    [[SEL]] = select fast i1 [[CMP_V_IV_3]], float [[RDX]], float 7.000000e+00
+; CHECK-VF1IC4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF1IC4-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-VF1IC4:       [[EXIT]]:
+; CHECK-VF1IC4-NEXT:    [[SEL_LCSSA:%.*]] = phi float [ [[SEL]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT:    ret float [[SEL_LCSSA]]
 ;
 entry:
   br label %loop
@@ -1153,7 +1243,7 @@ define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[WIDE_LOAD1]], <4 x i32> [[VEC_PHI]]
 ; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC1-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 3)
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -1173,7 +1263,7 @@ define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]]
 ; CHECK-VF4IC1-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-VF4IC1-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-VF4IC1-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4IC1-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK-VF4IC1:       [[EXIT]]:
 ; CHECK-VF4IC1-NEXT:    [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-VF4IC1-NEXT:    ret i32 [[SEL_LCSSA]]
@@ -1202,7 +1292,7 @@ define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[TMP27]] = select i1 [[TMP19]], <4 x i32> [[WIDE_LOAD13]], <4 x i32> [[VEC_PHI]]
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4IC4-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC4-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP27]], <4 x i1> [[TMP23]], i32 3)
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -1222,7 +1312,7 @@ define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]]
 ; CHECK-VF4IC4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-VF4IC4-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-VF4IC4-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4IC4-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK-VF4IC4:       [[EXIT]]:
 ; CHECK-VF4IC4-NEXT:    [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-VF4IC4-NEXT:    ret i32 [[SEL_LCSSA]]
@@ -1319,6 +1409,8 @@ exit:                                     ; preds = %loop
 ; CHECK-VF4IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
 ; CHECK-VF4IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
 ; CHECK-VF4IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK-VF4IC1: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK-VF4IC1: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
 ;.
 ; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -1336,6 +1428,8 @@ exit:                                     ; preds = %loop
 ; CHECK-VF4IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
 ; CHECK-VF4IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
 ; CHECK-VF4IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK-VF4IC4: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK-VF4IC4: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
 ;.
 ; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}

>From 782fa0dd569f47f45ca2b4ad08fb6a36d10179e6 Mon Sep 17 00:00:00 2001
From: Damian Heaton <Damian.Heaton at arm.com>
Date: Thu, 26 Feb 2026 15:55:52 +0000
Subject: [PATCH 3/9] Combine FindLast and FFindLast

Floating-point FindLast is not affected by (nor is there any reason for it to be affected by)
Fast Math Flags, and so we can pass in the blank FastMathFlags as for integer recurrence kinds
but specifically allow this Kind to be used for floating-point values.
---
 llvm/include/llvm/Analysis/IVDescriptors.h           |  5 +----
 llvm/lib/Analysis/IVDescriptors.cpp                  | 12 ++----------
 .../Target/AArch64/AArch64TargetTransformInfo.cpp    |  1 -
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp      |  3 ---
 4 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 5495596916b03..592eb2e90c1c6 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -65,9 +65,6 @@ enum class RecurKind {
   FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y
                   ///< are an integer type, one is the current recurrence value,
                   ///< and the other is an arbitrary value.
-  FFindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y are
-              ///< a floating-point type, one is the current recurrence value,
-              ///< and the other is an arbitrary value.
   // clang-format on
   // TODO: Any_of and FindLast reduction need not be restricted to integer type
   // only.
@@ -288,7 +285,7 @@ class RecurrenceDescriptor {
   ///   select(cmp(),x,y) where one of (x,y) is an arbitrary value and the
   ///   other is a recurrence.
   static bool isFindLastRecurrenceKind(RecurKind Kind) {
-    return Kind == RecurKind::FindLast || Kind == RecurKind::FFindLast;
+    return Kind == RecurKind::FindLast;
   }
 
   static bool isFindRecurrenceKind(RecurKind Kind) {
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index d4b61e0562b27..0ca0ee64cea12 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -503,7 +503,7 @@ bool RecurrenceDescriptor::AddReductionVar(
   // Check the scalar type to handle both scalar and vector types.
   Type *ScalarTy = RecurrenceType->getScalarType();
   if (ScalarTy->isFloatingPointTy()) {
-    if (!isFloatingPointRecurrenceKind(Kind))
+    if (Kind != RecurKind::FindLast && !isFloatingPointRecurrenceKind(Kind))
       return false;
   } else if (ScalarTy->isIntegerTy()) {
     if (!isIntegerRecurrenceKind(Kind))
@@ -938,9 +938,7 @@ RecurrenceDescriptor::isFindPattern(Loop *TheLoop, PHINode *OrigPhi,
                                      m_Value(NonRdxPhi)))))
     return InstDesc(false, I);
 
-  return OrigPhi->getType()->isFloatingPointTy()
-             ? InstDesc(I, RecurKind::FFindLast)
-             : InstDesc(I, RecurKind::FindLast);
+  return InstDesc(I, RecurKind::FindLast);
 }
 
 /// Returns true if the select instruction has users in the compare-and-add
@@ -1123,11 +1121,6 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
     LLVM_DEBUG(dbgs() << "Found a Find reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::FFindLast, TheLoop, FMF, RedDes, DB, AC,
-                      DT, SE)) {
-    LLVM_DEBUG(dbgs() << "Found a float Find reduction PHI." << *Phi << "\n");
-    return true;
-  }
   if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT,
                       SE)) {
     LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
@@ -1269,7 +1262,6 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
   case RecurKind::FMinimumNum:
     return Instruction::FCmp;
   case RecurKind::FindLast:
-  case RecurKind::FFindLast:
   case RecurKind::AnyOf:
   case RecurKind::FindIV:
     // TODO: Set AnyOf and FindIV to Instruction::Select once in-loop reductions
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index f8cba1ac567fb..b96d251a9c4d9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5562,7 +5562,6 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
   case RecurKind::FMulAdd:
   case RecurKind::AnyOf:
   case RecurKind::FindLast:
-  case RecurKind::FFindLast:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 59645c53d9548..3a10d2b2a7158 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -26923,7 +26923,6 @@ class HorizontalReduction {
         case RecurKind::AnyOf:
         case RecurKind::FindIV:
         case RecurKind::FindLast:
-        case RecurKind::FFindLast:
         case RecurKind::FMaxNum:
         case RecurKind::FMinNum:
         case RecurKind::FMaximumNum:
@@ -27066,7 +27065,6 @@ class HorizontalReduction {
     case RecurKind::AnyOf:
     case RecurKind::FindIV:
     case RecurKind::FindLast:
-    case RecurKind::FFindLast:
     case RecurKind::FMaxNum:
     case RecurKind::FMinNum:
     case RecurKind::FMaximumNum:
@@ -27171,7 +27169,6 @@ class HorizontalReduction {
     case RecurKind::AnyOf:
     case RecurKind::FindIV:
     case RecurKind::FindLast:
-    case RecurKind::FFindLast:
     case RecurKind::FMaxNum:
     case RecurKind::FMinNum:
     case RecurKind::FMaximumNum:

>From b8c82cbb0bf162a1865431a8dfa2ff4d6daaa336 Mon Sep 17 00:00:00 2001
From: Damian Heaton <Damian.Heaton at arm.com>
Date: Fri, 27 Feb 2026 17:49:11 +0000
Subject: [PATCH 4/9] Address review comments

---
 llvm/lib/Analysis/IVDescriptors.cpp           |   5 +-
 .../AArch64/AArch64TargetTransformInfo.cpp    |   8 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   9 +-
 .../CostModel/AArch64/extract-last-active.ll  |  12 +-
 .../RISCV/select-cmp-reduction.ll             | 254 +++++++++---------
 .../Transforms/LoopVectorize/select-cmp.ll    |   5 +-
 6 files changed, 150 insertions(+), 143 deletions(-)

diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 0ca0ee64cea12..3e0ea80001261 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -509,10 +509,7 @@ bool RecurrenceDescriptor::AddReductionVar(
     if (!isIntegerRecurrenceKind(Kind))
       return false;
     Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts);
-  } else if (ScalarTy->isPointerTy()) {
-    if (Kind != RecurKind::FindLast)
-      return false;
-  } else {
+  } else if (!ScalarTy->isPointerTy() || Kind != RecurKind::FindLast) {
     // Pointer min/max may exist, but it is not supported as a reduction op.
     return false;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b96d251a9c4d9..8afdf5fc44bfb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -626,6 +626,14 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
     if (VTy->getElementCount() == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
+  // Likewise if we return a scalar but the first operand is <vscale x 1 x
+  // eltty>.
+  if (ICA.getArgTypes().size()) {
+    Type *Ty = ICA.getArgTypes()[0];
+    if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
+      if (VTy->getElementCount() == ElementCount::getScalable(1))
+        return InstructionCost::getInvalid();
+  }
 
   switch (ICA.getID()) {
   case Intrinsic::experimental_vector_histogram_add: {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ed685e99c5a51..f8682941683f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5704,6 +5704,8 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
     return std::nullopt;
   };
 
+  VPTypeAnalysis TypeInfo(Plan);
+
   VPValue *HeaderMask = vputils::findHeaderMask(Plan);
   for (VPRecipeBase &Phi :
        make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
@@ -5712,6 +5714,10 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
                      PhiR->getRecurrenceKind()))
       continue;
 
+    Type *PhiTy = TypeInfo.inferScalarType(PhiR);
+    if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
+      continue;
+
     // If there's a header mask, the backedge select will not be the find-last
     // select.
     VPValue *BackedgeVal = PhiR->getBackedgeValue();
@@ -5741,9 +5747,6 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
     if (!SE.isKnownNonZero(Step))
       continue;
 
-    if (IVSCEV->getType()->isPointerTy())
-      continue;
-
     // Positive step means we need UMax/SMax to find the last IV value, and
     // UMin/SMin otherwise.
     bool UseMax = SE.isKnownPositive(Step);
diff --git a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
index 96531a6dada69..d81d343239612 100644
--- a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
@@ -110,12 +110,12 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
-; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
+; SVE-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
-; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
-; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
+; SVE-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Invalid cost for instruction: %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SME-STREAMING-LABEL: 'extractions'
@@ -167,12 +167,12 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
-; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
+; SME-STREAMING-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
-; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
-; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
+; SME-STREAMING-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Invalid cost for instruction: %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
index fd118aaf81a02..f3a6bcc15c392 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
@@ -4,32 +4,32 @@
 define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) {
 ; CHECK-LABEL: define i32 @select_icmp(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr readonly captures(none) [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[VECTOR_PH:.*:]]
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
-; CHECK:       [[VECTOR_BODY1]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sge <vscale x 4 x i32> [[VP_OP_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP3]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP4]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[FOR_END:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sge <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP14]])
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = freeze i1 [[TMP10]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[Y]], i32 0
+; CHECK-NEXT:    br label %[[FOR_END:.*]]
 ; CHECK:       [[FOR_END]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP8]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[Y]], i32 0
-; CHECK-NEXT:    br label %[[FOR_END1:.*]]
-; CHECK:       [[FOR_END1]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
 ;
 entry:
@@ -53,32 +53,32 @@ for.end:
 define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) {
 ; CHECK-LABEL: define i32 @select_fcmp(
 ; CHECK-SAME: float [[X:%.*]], i32 [[Y:%.*]], ptr readonly captures(none) [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[VECTOR_PH:.*:]]
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
-; CHECK:       [[VECTOR_BODY1]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = fcmp fast uge <vscale x 4 x float> [[VP_OP_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP3]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP4]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[FOR_END:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast uge <vscale x 4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP14]])
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = freeze i1 [[TMP10]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[Y]], i32 0
+; CHECK-NEXT:    br label %[[FOR_END:.*]]
 ; CHECK:       [[FOR_END]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP8]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[Y]], i32 0
-; CHECK-NEXT:    br label %[[FOR_END1:.*]]
-; CHECK:       [[FOR_END1]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
 ;
 entry:
@@ -102,30 +102,30 @@ for.end:
 define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: define i32 @select_const_i32_from_icmp(
 ; CHECK-SAME: ptr readonly captures(none) [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[VECTOR_PH:.*:]]
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
-; CHECK:       [[VECTOR_BODY1]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
-; CHECK-NEXT:    [[TMP3]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP4]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[EXIT:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP21]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-NEXT:    [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP21]])
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP21]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = freeze i1 [[TMP10]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 7, i32 3
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP8]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 7, i32 3
-; CHECK-NEXT:    br label %[[EXIT1:.*]]
-; CHECK:       [[EXIT1]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
 ;
 entry:
@@ -149,30 +149,30 @@ exit:                                     ; preds = %for.body
 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) {
 ; CHECK-LABEL: define i32 @select_i32_from_icmp(
 ; CHECK-SAME: ptr readonly captures(none) [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[VECTOR_PH:.*:]]
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
-; CHECK:       [[VECTOR_BODY1]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
-; CHECK-NEXT:    [[TMP3]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP4]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[EXIT:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP21]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-NEXT:    [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP21]])
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP21]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = freeze i1 [[TMP10]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[B]], i32 [[A]]
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP8]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[B]], i32 [[A]]
-; CHECK-NEXT:    br label %[[EXIT1:.*]]
-; CHECK:       [[EXIT1]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
 ;
 entry:
@@ -196,30 +196,30 @@ exit:                                     ; preds = %for.body
 define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: define i32 @select_const_i32_from_fcmp(
 ; CHECK-SAME: ptr readonly captures(none) [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[VECTOR_PH:.*:]]
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
-; CHECK:       [[VECTOR_BODY1]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = fcmp fast one <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
-; CHECK-NEXT:    [[TMP3]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP4]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[EXIT:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP21]])
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast one <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
+; CHECK-NEXT:    [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP21]])
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP21]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = freeze i1 [[TMP10]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 1, i32 2
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP8]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 2
-; CHECK-NEXT:    br label %[[EXIT1:.*]]
-; CHECK:       [[EXIT1]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
 ;
 entry:
@@ -307,33 +307,33 @@ exit:                                     ; preds = %for.body
 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) {
 ; CHECK-LABEL: define i32 @pred_select_const_i32_from_icmp(
 ; CHECK-SAME: ptr noalias readonly captures(none) [[SRC1:%.*]], ptr noalias readonly captures(none) [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[VECTOR_PH:.*:]]
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
-; CHECK:       [[VECTOR_BODY1]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_BODY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 35)
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP3]], <vscale x 4 x i1> [[TMP2]], i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <vscale x 4 x i32> [[VP_OP_LOAD1]], splat (i32 2)
-; CHECK-NEXT:    [[TMP5:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP4]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP11]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[PREDPHI]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP7]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 35)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> [[TMP7]], i32 [[TMP17]])
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 2)
+; CHECK-NEXT:    [[TMP10:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
+; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[VEC_PHI]]
+; CHECK-NEXT:    [[PREDPHI]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[PREDPHI1]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP17]])
+; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP11]])
-; CHECK-NEXT:    [[TMP10:%.*]] = freeze i1 [[TMP9]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i32 1, i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[PREDPHI]])
+; CHECK-NEXT:    [[TMP13:%.*]] = freeze i1 [[TMP12]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 0
 ; CHECK-NEXT:    br label %[[FOR_END_LOOPEXIT:.*]]
 ; CHECK:       [[FOR_END_LOOPEXIT]]:
 ; CHECK-NEXT:    ret i32 [[RDX_SELECT]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 0e4fedfedc2ca..dc2961e3a7778 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -1088,9 +1088,6 @@ exit:                                     ; preds = %loop
   ret i32 %sel
 }
 
-;; Negative tests
-
-; We don't support FP reduction variables at the moment.
 define float @select_const_f32_from_icmp(ptr %v, i64 %n) {
 ; CHECK-VF4IC1-LABEL: define float @select_const_f32_from_icmp(
 ; CHECK-VF4IC1-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
@@ -1357,6 +1354,8 @@ exit:                                     ; preds = %loop
   ret i32 %sel
 }
 
+;; Negative tests
+
 ; We only support selects where the input comes from the same PHI as the
 ; reduction PHI. In the example below, the select uses the induction
 ; variable input and the icmp uses the reduction PHI.

>From 0008e4971360b725f49f62ac0990bd0b59732ef0 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 2 Mar 2026 10:06:23 +0000
Subject: [PATCH 5/9] Minor changes

* Remove two NFC changes from the PR (isFindLastRecurrenceKind -> Kind == RecurKind::FindLast)
* Make code that returns Invalid cost for nxv1* types for intrinsics more generic than just the first operand.
* Refactor condition in `AddReductionVar` to make logic a bit simpler.
---
 llvm/lib/Analysis/IVDescriptors.cpp               | 15 ++++++++++-----
 .../Target/AArch64/AArch64TargetTransformInfo.cpp | 11 +++++------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 3e0ea80001261..13bee9992cb95 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -502,14 +502,19 @@ bool RecurrenceDescriptor::AddReductionVar(
   // to evaluate the reduction in the narrower width.
   // Check the scalar type to handle both scalar and vector types.
   Type *ScalarTy = RecurrenceType->getScalarType();
-  if (ScalarTy->isFloatingPointTy()) {
-    if (Kind != RecurKind::FindLast && !isFloatingPointRecurrenceKind(Kind))
+  if (Kind == RecurKind::FindLast) {
+    // FindLast supports all types can otherwise be used in recurrences.
+    if (!ScalarTy->isFloatingPointTy() && !ScalarTy->isIntegerTy() &&
+        !ScalarTy->isPointerTy())
+      return false;
+  } else if (ScalarTy->isFloatingPointTy()) {
+    if (!isFloatingPointRecurrenceKind(Kind))
       return false;
   } else if (ScalarTy->isIntegerTy()) {
     if (!isIntegerRecurrenceKind(Kind))
       return false;
     Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts);
-  } else if (!ScalarTy->isPointerTy() || Kind != RecurKind::FindLast) {
+  } else {
     // Pointer min/max may exist, but it is not supported as a reduction op.
     return false;
   }
@@ -648,7 +653,7 @@ bool RecurrenceDescriptor::AddReductionVar(
         // "FindLast-like" phis are not supported see:
         // IVDescriptorsTest.UnsupportedFindLastPhi.
         FoundFindLastLikePhi =
-            isFindLastRecurrenceKind(Kind) && !FoundFindLastLikePhi &&
+            Kind == RecurKind::FindLast && !FoundFindLastLikePhi &&
             isFindLastLikePhi(cast<PHINode>(Cur), Phi, VisitedInsts);
         if (!FoundFindLastLikePhi)
           return false;
@@ -733,7 +738,7 @@ bool RecurrenceDescriptor::AddReductionVar(
   // We only expect to match a single "find-last-like" phi per find-last
   // reduction, with no non-phi operations in the reduction use chain.
   assert((!FoundFindLastLikePhi ||
-          (isFindLastRecurrenceKind(Kind) && NumNonPHIUsers == 0)) &&
+          (Kind == RecurKind::FindLast && NumNonPHIUsers == 0)) &&
          "Unexpectedly matched a 'find-last-like' phi");
 
   if (isAnyOfRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8afdf5fc44bfb..fd8868b1f4df3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -626,12 +626,11 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
     if (VTy->getElementCount() == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
-  // Likewise if we return a scalar but the first operand is <vscale x 1 x
-  // eltty>.
-  if (ICA.getArgTypes().size()) {
-    Type *Ty = ICA.getArgTypes()[0];
-    if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
-      if (VTy->getElementCount() == ElementCount::getScalable(1))
+
+  // Likewise if any of the operands is a <vscale x 1 x eltty>.
+  for (const Type *ArgTy : ICA.getArgTypes()) {
+    if (auto *ArgVTy = dyn_cast<ScalableVectorType>(ArgTy))
+      if (ArgVTy->getElementCount() == ElementCount::getScalable(1))
         return InstructionCost::getInvalid();
   }
 

>From 7257411c1915c0ccf09e7e17cc1b535bae087837 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 2 Mar 2026 11:31:02 +0000
Subject: [PATCH 6/9] Fix comment

---
 llvm/lib/Analysis/IVDescriptors.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 13bee9992cb95..79795127e97d2 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -503,7 +503,7 @@ bool RecurrenceDescriptor::AddReductionVar(
   // Check the scalar type to handle both scalar and vector types.
   Type *ScalarTy = RecurrenceType->getScalarType();
   if (Kind == RecurKind::FindLast) {
-    // FindLast supports all types can otherwise be used in recurrences.
+    // FindLast supports all primitive scalar types.
     if (!ScalarTy->isFloatingPointTy() && !ScalarTy->isIntegerTy() &&
         !ScalarTy->isPointerTy())
       return false;

>From 4f6c43eec64eca374ed4eb1211298710c0ee6f5c Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Wed, 4 Mar 2026 13:49:39 +0000
Subject: [PATCH 7/9] Remove any changes related to the cost-model for <vscale
 x 1 ..> types.

---
 .../Target/AArch64/AArch64TargetTransformInfo.cpp   |  7 -------
 .../CostModel/AArch64/extract-last-active.ll        | 13 ++++---------
 2 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fd8868b1f4df3..b96d251a9c4d9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -627,13 +627,6 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     if (VTy->getElementCount() == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
 
-  // Likewise if any of the operands is a <vscale x 1 x eltty>.
-  for (const Type *ArgTy : ICA.getArgTypes()) {
-    if (auto *ArgVTy = dyn_cast<ScalableVectorType>(ArgTy))
-      if (ArgVTy->getElementCount() == ElementCount::getScalable(1))
-        return InstructionCost::getInvalid();
-  }
-
   switch (ICA.getID()) {
   case Intrinsic::experimental_vector_histogram_add: {
     InstructionCost HistCost = getHistogramCost(ST, ICA);
diff --git a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
index d81d343239612..a9765d757b378 100644
--- a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
@@ -58,7 +58,6 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
-; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SVE-LABEL: 'extractions'
@@ -110,12 +109,11 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
-; SVE-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
-; SVE-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
-; SVE-NEXT:  Cost Model: Invalid cost for instruction: %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SME-STREAMING-LABEL: 'extractions'
@@ -167,12 +165,11 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
-; SME-STREAMING-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
-; SME-STREAMING-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
-; SME-STREAMING-NEXT:  Cost Model: Invalid cost for instruction: %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
@@ -234,8 +231,6 @@ define void @extractions() {
   %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
   %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
   %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
-  %nxv1p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv1p0(<vscale x 1 x ptr> poison, <vscale x 1 x i1> poison, ptr poison)
-
 
   ret void
 }

>From 7a5f586f3c712ee337a764cd19e58da4c20ad0f5 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Wed, 4 Mar 2026 16:41:26 +0000
Subject: [PATCH 8/9] Remove cost-model checks entirely

---
 .../CostModel/AArch64/extract-last-active.ll  | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
index a9765d757b378..9efcf912076b0 100644
--- a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
@@ -13,7 +13,6 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2p0 = call ptr @llvm.experimental.vector.extract.last.active.v2p0(<2 x ptr> poison, <2 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
@@ -22,7 +21,6 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
-; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv2p0(<vscale x 2 x ptr> poison, <vscale x 2 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
@@ -31,7 +29,6 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4p0 = call ptr @llvm.experimental.vector.extract.last.active.v4p0(<4 x ptr> poison, <4 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
@@ -40,7 +37,6 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
-; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv4p0(<vscale x 4 x ptr> poison, <vscale x 4 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
@@ -49,7 +45,6 @@ define void @extractions() {
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1p0 = call ptr @llvm.experimental.vector.extract.last.active.v1p0(<1 x ptr> poison, <1 x i1> poison, ptr poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
 ; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
@@ -69,7 +64,6 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
-; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2p0 = call ptr @llvm.experimental.vector.extract.last.active.v2p0(<2 x ptr> poison, <2 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
@@ -78,7 +72,6 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
-; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv2p0(<vscale x 2 x ptr> poison, <vscale x 2 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
@@ -87,7 +80,6 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
-; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4p0 = call ptr @llvm.experimental.vector.extract.last.active.v4p0(<4 x ptr> poison, <4 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
@@ -96,7 +88,6 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
-; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv4p0(<vscale x 4 x ptr> poison, <vscale x 4 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
@@ -105,7 +96,6 @@ define void @extractions() {
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
-; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1p0 = call ptr @llvm.experimental.vector.extract.last.active.v1p0(<1 x ptr> poison, <1 x i1> poison, ptr poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
@@ -125,7 +115,6 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
-; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2p0 = call ptr @llvm.experimental.vector.extract.last.active.v2p0(<2 x ptr> poison, <2 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
@@ -134,7 +123,6 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
-; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv2p0(<vscale x 2 x ptr> poison, <vscale x 2 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
@@ -143,7 +131,6 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
-; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4p0 = call ptr @llvm.experimental.vector.extract.last.active.v4p0(<4 x ptr> poison, <4 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
@@ -152,7 +139,6 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
-; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv4p0(<vscale x 4 x ptr> poison, <vscale x 4 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
@@ -161,7 +147,6 @@ define void @extractions() {
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
-; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1p0 = call ptr @llvm.experimental.vector.extract.last.active.v1p0(<1 x ptr> poison, <1 x i1> poison, ptr poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
 ; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
@@ -182,7 +167,6 @@ define void @extractions() {
   %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
   %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
   %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
-  %v2p0 = call ptr @llvm.experimental.vector.extract.last.active.v2p0(<2 x ptr> poison, <2 x i1> poison, ptr poison)
   %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
   %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
   %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
@@ -191,7 +175,6 @@ define void @extractions() {
   %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
   %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
   %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
-  %nxv2p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv2p0(<vscale x 2 x ptr> poison, <vscale x 2 x i1> poison, ptr poison)
 
   ;; Wider-than-legal
   %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
@@ -202,7 +185,6 @@ define void @extractions() {
   %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
   %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
   %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
-  %v4p0 = call ptr @llvm.experimental.vector.extract.last.active.v4p0(<4 x ptr> poison, <4 x i1> poison, ptr poison)
   %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
   %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
   %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
@@ -211,7 +193,6 @@ define void @extractions() {
   %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
   %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
   %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
-  %nxv4p0 = call ptr @llvm.experimental.vector.extract.last.active.nxv4p0(<vscale x 4 x ptr> poison, <vscale x 4 x i1> poison, ptr poison)
 
   ;; Narrower-than-legal
   %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
@@ -222,7 +203,6 @@ define void @extractions() {
   %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
   %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
   %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
-  %v1p0 = call ptr @llvm.experimental.vector.extract.last.active.v1p0(<1 x ptr> poison, <1 x i1> poison, ptr poison)
   %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
   %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
   %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)

>From e820cfd44f9165ee1ad846f9deb423b689a901b3 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Thu, 5 Mar 2026 11:17:58 +0000
Subject: [PATCH 9/9] Remove VPTypeAnalysis variable

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f8682941683f0..8b3678a68d9b0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5704,8 +5704,6 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
     return std::nullopt;
   };
 
-  VPTypeAnalysis TypeInfo(Plan);
-
   VPValue *HeaderMask = vputils::findHeaderMask(Plan);
   for (VPRecipeBase &Phi :
        make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
@@ -5714,7 +5712,7 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
                      PhiR->getRecurrenceKind()))
       continue;
 
-    Type *PhiTy = TypeInfo.inferScalarType(PhiR);
+    Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
     if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
       continue;