[llvm] [SVE] Wide active lane mask (PR #76514)

Momchil Velikov via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 19 01:49:32 PST 2024


https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/76514

>From a6cef256a6f80ff51ea8bbe6bf886c9e5248a721 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 22 Dec 2023 11:30:22 +0000
Subject: [PATCH 1/3] [AArch64] Allow active lane mask with 32 elements

Support lowering of `llvm.get.active.lane.mask.*` yielding a vector with
32 elements when the result is immediately split in half.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 125 +++++++++++++-----
 llvm/test/CodeGen/AArch64/active_lane_mask.ll |   1 +
 .../AArch64/get-active-lane-mask-32x1.ll      |  32 +++++
 3 files changed, 122 insertions(+), 36 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/get-active-lane-mask-32x1.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8a6f1dc7487bae8..64a6aa681ade9f6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1776,15 +1776,17 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
 
 bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
                                                           EVT OpVT) const {
-  // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
-  if (!Subtarget->hasSVE())
+  // Only SVE/SME has a 1:1 mapping from intrinsic -> instruction (whilelo).
+  if (!Subtarget->hasSVEorSME())
     return true;
 
   // We can only support legal predicate result types. We can use the SVE
   // whilelo instruction for generating fixed-width predicates too.
   if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
       ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
-      ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
+      ResVT != MVT::v8i1 && ResVT != MVT::v16i1 &&
+      (!(Subtarget->hasSVE2p1() || Subtarget->hasSME2()) ||
+       ResVT != MVT::nxv32i1)) // TODO: handle MVT::v32i1
     return true;
 
   // The whilelo instruction only works with i32 or i64 scalar inputs.
@@ -19797,47 +19799,98 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
   return SDValue();
 }
 
-static SDValue performIntrinsicCombine(SDNode *N,
-                                       TargetLowering::DAGCombinerInfo &DCI,
-                                       const AArch64Subtarget *Subtarget) {
+static SDValue tryCombineGetActiveLaneMask(SDNode *N,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           const AArch64Subtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
-  unsigned IID = getIntrinsicID(N);
-  switch (IID) {
-  default:
-    break;
-  case Intrinsic::get_active_lane_mask: {
-    SDValue Res = SDValue();
-    EVT VT = N->getValueType(0);
-    if (VT.isFixedLengthVector()) {
-      // We can use the SVE whilelo instruction to lower this intrinsic by
-      // creating the appropriate sequence of scalable vector operations and
-      // then extracting a fixed-width subvector from the scalable vector.
+  EVT VT = N->getValueType(0);
+  if (VT.isFixedLengthVector()) {
+    // We can use the SVE whilelo instruction to lower this intrinsic by
+    // creating the appropriate sequence of scalable vector operations and
+    // then extracting a fixed-width subvector from the scalable vector.
+    SDLoc DL(N);
+    SDValue ID =
+        DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
 
-      SDLoc DL(N);
-      SDValue ID =
-          DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
+    EVT WhileVT =
+        EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                         ElementCount::getScalable(VT.getVectorNumElements()));
 
-      EVT WhileVT = EVT::getVectorVT(
-          *DAG.getContext(), MVT::i1,
-          ElementCount::getScalable(VT.getVectorNumElements()));
+    // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
+    EVT PromVT = getPromotedVTForPredicate(WhileVT);
 
-      // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
-      EVT PromVT = getPromotedVTForPredicate(WhileVT);
+    // Get the fixed-width equivalent of PromVT for extraction.
+    EVT ExtVT =
+        EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
+                         VT.getVectorElementCount());
 
-      // Get the fixed-width equivalent of PromVT for extraction.
-      EVT ExtVT =
-          EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
-                           VT.getVectorElementCount());
+    SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
+                              N->getOperand(1), N->getOperand(2));
+    Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
+    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
+                      DAG.getConstant(0, DL, MVT::i64));
+    Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
 
-      Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
-                        N->getOperand(1), N->getOperand(2));
-      Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
-      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
-                        DAG.getConstant(0, DL, MVT::i64));
-      Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
-    }
     return Res;
   }
+
+  if (!Subtarget->hasSVE2p1() && !Subtarget->hasSME2())
+    return SDValue();
+
+  if (!N->hasNUsesOfValue(2, 0))
+    return SDValue();
+
+  auto It = N->use_begin();
+  SDNode *Lo = *It++;
+  SDNode *Hi = *It;
+
+  const uint64_t HalfSize = VT.getVectorMinNumElements() / 2;
+  uint64_t OffLo, OffHi;
+  if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      !isIntImmediate(Lo->getOperand(1).getNode(), OffLo) ||
+      (OffLo != 0 && OffLo != HalfSize) ||
+      Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      !isIntImmediate(Hi->getOperand(1).getNode(), OffHi) ||
+      (OffHi != 0 && OffHi != HalfSize))
+    return SDValue();
+
+  if (OffLo > OffHi) {
+    std::swap(Lo, Hi);
+    std::swap(OffLo, OffHi);
+  }
+
+  if (OffLo != 0 || OffHi != HalfSize)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
+  SDValue Idx = N->getOperand(1);
+  SDValue TC = N->getOperand(2);
+  if (Idx.getValueType() != MVT::i64) {
+    Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
+    TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
+  }
+  auto R =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
+                  {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
+
+  DCI.CombineTo(Lo, R.getValue(0));
+  DCI.CombineTo(Hi, R.getValue(1));
+
+  return SDValue(N, 0);
+}
+
+static SDValue performIntrinsicCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       const AArch64Subtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned IID = getIntrinsicID(N);
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::get_active_lane_mask:
+    return tryCombineGetActiveLaneMask(N, DCI, Subtarget);
   case Intrinsic::aarch64_neon_vcvtfxs2fp:
   case Intrinsic::aarch64_neon_vcvtfxu2fp:
     return tryCombineFixedPointConvert(N, DCI, DAG);
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index a65c5d66677946a..6a509b5f3afcae4 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s
 
 ; == Scalable ==
 
diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-32x1.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-32x1.ll
new file mode 100644
index 000000000000000..eac8d7f90a89d8b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-32x1.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s
+; RUN: llc -mattr=+sme2   < %s | FileCheck %s
+target triple = "aarch64-linux"
+
+; Test lowering of 32-element active lane mask (the return statement implicitly
+; creates a pair of extract_vector operations).
+
+define <vscale x 32 x i1> @f0(i32 %i, i32 %n) #0 {
+; CHECK-LABEL: f0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    mov w9, w0
+; CHECK-NEXT:    whilelo { p0.b, p1.b }, x9, x8
+; CHECK-NEXT:    ret
+    %r = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i32(i32 %i, i32 %n)
+    ret <vscale x 32 x i1> %r
+}
+
+define <vscale x 32 x i1> @g0(i64 %i, i64 %n) #0 {
+; CHECK-LABEL: g0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelo { p0.b, p1.b }, x0, x1
+; CHECK-NEXT:    ret
+    %r = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %i, i64 %n)
+    ret <vscale x 32 x i1> %r
+}
+
+declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i32(i32, i32)
+declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)
+
+attributes #0 = { nounwind }

>From fa25c50f957ffb6dffe04828291df1c100f484f3 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Wed, 27 Dec 2023 14:47:06 +0000
Subject: [PATCH 2/3] [LoopVectorize][AArch64][SVE] Generate wide active lane
 masks

This patch makes the LoopVectorize generate lane masks longer than
the VF to allow the target to better utilise the instruction set.
The vectorizer emit one or more wide `llvm.get.active.lane.mask.*`
calls plus several `llvm.vector.extract.*` calls to yield the required
number of VF-wide masks.

The motivating exammple is a vectorised loop with unroll factor 2 that
can use the SVE2.1 `whilelo` instruction with predicate pair result, or
a SVE `whilelo` instruction with smaller element size plus
`punpklo`/`punpkhi`.

How wide is the lane mask that the vectoriser emits is controlled
by a TargetTransformInfo hook `getMaxPredicateLength`.The default
impementation (return the same length as the VF) keeps the
change non-functional for targets that can't or are not prepared
to handle wider lane masks.
---
 .../llvm/Analysis/TargetTransformInfo.h       |   10 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |    2 +
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |    2 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |    4 +
 .../AArch64/AArch64TargetTransformInfo.cpp    |    8 +
 .../AArch64/AArch64TargetTransformInfo.h      |    2 +
 .../Vectorize/LoopVectorizationPlanner.h      |    8 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |    7 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   39 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   84 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   18 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |    1 +
 .../CodeGen/AArch64/sve-wide-lane-mask.ll     | 1015 ++++++++++
 .../AArch64/scalable-strict-fadd.ll           | 1768 +++++++++--------
 .../AArch64/sve-tail-folding-unroll.ll        |  386 ++--
 .../AArch64/sve-wide-lane-mask.ll             |  655 ++++++
 .../ARM/tail-folding-prefer-flag.ll           |    6 +-
 .../strict-fadd-interleave-only.ll            |   20 +-
 18 files changed, 2924 insertions(+), 1111 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9697278eaeaee29..d576ee3f2eb9d3b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1219,6 +1219,8 @@ class TargetTransformInfo {
   /// and the number of execution units in the CPU.
   unsigned getMaxInterleaveFactor(ElementCount VF) const;
 
+  ElementCount getMaxPredicateLength(ElementCount VF) const;
+
   /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
   static OperandValueInfo getOperandInfo(const Value *V);
 
@@ -1956,6 +1958,9 @@ class TargetTransformInfo::Concept {
   virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
 
   virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
+
+  virtual ElementCount getMaxPredicateLength(ElementCount VF) const = 0;
+
   virtual InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
       OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
@@ -2565,6 +2570,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMaxInterleaveFactor(ElementCount VF) override {
     return Impl.getMaxInterleaveFactor(VF);
   }
+
+  ElementCount getMaxPredicateLength(ElementCount VF) const override {
+    return Impl.getMaxPredicateLength(VF);
+  }
+
   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                             unsigned &JTSize,
                                             ProfileSummaryInfo *PSI,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 60eab53fa2f6019..306765ce156e6ba 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -515,6 +515,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
 
+  ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
+
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
       TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 5e7bdcdf72a49f8..fcad6a86538256f 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -881,6 +881,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
   unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
 
+  ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
+
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
       TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index a5a18a538d76899..023bcaa61e18854 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -798,6 +798,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
   return TTIImpl->getMaxInterleaveFactor(VF);
 }
 
+ElementCount TargetTransformInfo::getMaxPredicateLength(ElementCount VF) const {
+  return TTIImpl->getMaxPredicateLength(VF);
+}
+
 TargetTransformInfo::OperandValueInfo
 TargetTransformInfo::getOperandInfo(const Value *V) {
   OperandValueKind OpInfo = OK_AnyValue;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d358a5c8bd9499f..329f66aac76fb67 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3276,6 +3276,14 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
   return ST->getMaxInterleaveFactor();
 }
 
+ElementCount AArch64TTIImpl::getMaxPredicateLength(ElementCount VF) const {
+  // Do not create masks that are more than twice the VF.
+  unsigned N = ST->hasSVE2p1() ? 32 : ST->hasSVE() ? 16 : 0;
+  N = std::min(N, 2 * VF.getKnownMinValue());
+  return VF.isScalable() ? ElementCount::getScalable(N)
+                         : ElementCount::getFixed(N);
+}
+
 // For Falkor, we want to avoid having too many strided loads in a loop since
 // that can exhaust the HW prefetcher resources.  We adjust the unroller
 // MaxCount preference below to attempt to ensure unrolling doesn't create too
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index f471294ffc25207..b008bffd23587e1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -157,6 +157,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
 
   unsigned getMaxInterleaveFactor(ElementCount VF);
 
+  ElementCount getMaxPredicateLength(ElementCount VF) const;
+
   bool prefersVectorizedAddressing() const;
 
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a7ebf78e54ceb61..0e681c8080bfd10 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -184,6 +184,14 @@ class VPBuilder {
   VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
                       DebugLoc DL = {}, const Twine &Name = "");
 
+  VPValue *createGetActiveLaneMask(VPValue *IV, VPValue *TC, DebugLoc DL,
+                                   const Twine &Name = "") {
+    auto *ALM = new VPActiveLaneMaskRecipe(IV, TC, DL, Name);
+    if (BB)
+      BB->insert(ALM, InsertPt);
+    return ALM;
+  }
+
   //===--------------------------------------------------------------------===//
   // RAII helpers.
   //===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 77d54c9d08f4e62..e63cc79dead39c6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -604,6 +604,10 @@ class InnerLoopVectorizer {
   /// count of the original loop for both main loop and epilogue vectorization.
   void setTripCount(Value *TC) { TripCount = TC; }
 
+  ElementCount getMaxPredicateLength(ElementCount VF) const {
+    return TTI->getMaxPredicateLength(VF);
+  }
+
 protected:
   friend class LoopVectorizationPlanner;
 
@@ -7507,7 +7511,8 @@ LoopVectorizationPlanner::executePlan(
     VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
   // Perform the actual loop transformation.
-  VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
+  VPTransformState State(BestVF, BestUF, TTI.getMaxPredicateLength(BestVF), LI,
+                         DT, ILV.Builder, &ILV, &BestVPlan,
                          OrigLoop->getHeader()->getContext());
 
   // 0. Generate SCEV-dependent code into the preheader, including TripCount,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4b4f4911eb6415e..85972f9bef9af7d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -232,15 +232,16 @@ struct VPIteration {
 /// VPTransformState holds information passed down when "executing" a VPlan,
 /// needed for generating the output IR.
 struct VPTransformState {
-  VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
-                   DominatorTree *DT, IRBuilderBase &Builder,
+  VPTransformState(ElementCount VF, unsigned UF, ElementCount MaxPred,
+                   LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
                    InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx)
-      : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
-        LVer(nullptr), TypeAnalysis(Ctx) {}
+      : VF(VF), UF(UF), MaxPred(MaxPred), LI(LI), DT(DT), Builder(Builder),
+        ILV(ILV), Plan(Plan), LVer(nullptr), TypeAnalysis(Ctx) {}
 
   /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
   ElementCount VF;
   unsigned UF;
+  ElementCount MaxPred;
 
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
@@ -1177,7 +1178,6 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     switch (getOpcode()) {
     default:
       return false;
-    case VPInstruction::ActiveLaneMask:
     case VPInstruction::CalculateTripCountMinusVF:
     case VPInstruction::CanonicalIVIncrementForPart:
     case VPInstruction::BranchOnCount:
@@ -1202,6 +1202,35 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
   }
 };
 
+class VPActiveLaneMaskRecipe : public VPRecipeWithIRFlags, public VPValue {
+  const std::string Name;
+  ElementCount MaxLength;
+
+public:
+  VPActiveLaneMaskRecipe(VPValue *IV, VPValue *TC, // ElementCount MaxLength,
+                         DebugLoc DL = {}, const Twine &Name = "")
+      : VPRecipeWithIRFlags(VPDef::VPActiveLaneMaskSC,
+                            std::initializer_list<VPValue *>{IV, TC}, DL),
+        VPValue(this), Name(Name.str()) {} //, MaxLength(MaxLength) {}
+
+  VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskSC)
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+
+    return getOperand(0) == Op;
+  }
+};
+
 /// VPWidenRecipe is a recipe for producing a copy of vector type its
 /// ingredient. This recipe covers most of the traditional vectorization cases
 /// where each ingredient transforms into a vectorized version of itself.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c553e2c9e76839e..e65cef67f1307ae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -306,18 +306,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
     Value *Op2 = State.get(getOperand(2), Part);
     return Builder.CreateSelect(Cond, Op1, Op2, Name);
   }
-  case VPInstruction::ActiveLaneMask: {
-    // Get first lane of vector induction variable.
-    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
-    // Get the original loop tripcount.
-    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
 
-    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
-    auto *PredTy = VectorType::get(Int1Ty, State.VF);
-    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
-                                   {PredTy, ScalarTC->getType()},
-                                   {VIVElem0, ScalarTC}, nullptr, Name);
-  }
   case VPInstruction::FirstOrderRecurrenceSplice: {
     // Generate code to combine the previous and current values in vector v3.
     //
@@ -540,9 +529,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::SLPStore:
     O << "combined store";
     break;
-  case VPInstruction::ActiveLaneMask:
-    O << "active lane mask";
-    break;
   case VPInstruction::FirstOrderRecurrenceSplice:
     O << "first-order splice";
     break;
@@ -573,8 +559,78 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
     DL.print(O);
   }
 }
+
+void VPActiveLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+                                   VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+
+  printAsOperand(O, SlotTracker);
+  O << " = active lane mask";
+  printFlags(O);
+  printOperands(O, SlotTracker);
+
+  if (auto DL = getDebugLoc()) {
+    O << ", !dbg ";
+    DL.print(O);
+  }
+}
+
 #endif
 
+void VPActiveLaneMaskRecipe::execute(VPTransformState &State) {
+  assert(!State.Instance && "VPInstruction executing an Instance");
+
+  IRBuilderBase &Builder = State.Builder;
+  Builder.SetCurrentDebugLocation(getDebugLoc());
+
+  auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+  auto *PredTy = VectorType::get(Int1Ty, State.VF);
+
+  unsigned MaxPred = std::min(State.MaxPred.getKnownMinValue(),
+                              State.UF * State.VF.getKnownMinValue());
+  if (State.UF <= 1 || MaxPred <= State.VF.getKnownMinValue() ||
+      MaxPred % State.VF.getKnownMinValue() != 0) {
+    for (int Part = State.UF - 1; Part >= 0; --Part) {
+      // Get first lane of vector induction variable.
+      Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+      // Get the original loop tripcount.
+      Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
+      Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
+                                         {PredTy, ScalarTC->getType()},
+                                         {VIVElem0, ScalarTC}, nullptr, Name);
+      State.set(this, V, Part);
+    }
+    return;
+  }
+
+  // Generate long active lane masks covering all the unrolled iterations.
+  unsigned PartsPerMask = MaxPred / State.VF.getKnownMinValue();
+  auto *LongPredTy = VectorType::get(Int1Ty, MaxPred, State.VF.isScalable());
+  SmallVector<Value *> LongMask(State.UF / PartsPerMask, nullptr);
+  for (int Part = State.UF - PartsPerMask; Part >= 0; Part -= PartsPerMask) {
+    // Get first lane of vector induction variable.
+    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+    // Get the original loop tripcount.
+    Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
+    Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
+                                       {LongPredTy, ScalarTC->getType()},
+                                       {VIVElem0, ScalarTC}, nullptr, Name);
+    LongMask[Part / PartsPerMask] = V;
+  }
+
+  for (int Part = State.UF - 1; Part >= 0; --Part) {
+    Value *ALM = LongMask[Part / PartsPerMask];
+    const unsigned I = Part % PartsPerMask;
+    Value *V = Builder.CreateIntrinsic(
+        Intrinsic::vector_extract, {PredTy, ALM->getType()},
+        {ALM, ConstantInt::get(Type::getInt64Ty(Builder.getContext()),
+                               I * State.VF.getKnownMinValue())},
+        nullptr, Name);
+
+    State.set(this, V, Part);
+  }
+}
+
 void VPWidenCallRecipe::execute(VPTransformState &State) {
   assert(State.VF.isVector() && "not widening");
   auto &CI = *cast<CallInst>(getUnderlyingInstr());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b3694e74a385099..f9fe9caf9eb6c15 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -560,8 +560,7 @@ static bool canSimplifyBranchOnCond(VPInstruction *Term) {
   if (!Not || Not->getOpcode() != VPInstruction::Not)
     return false;
 
-  VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0));
-  return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask;
+  return dyn_cast<VPActiveLaneMaskRecipe>(Not->getOperand(0));
 }
 
 void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -1095,9 +1094,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
-      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
-                           DL, "active.lane.mask.entry");
+  auto *EntryALM = Builder.createGetActiveLaneMask(EntryIncrement, TC, DL,
+                                                   "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
@@ -1111,9 +1109,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   auto *InLoopIncrement =
       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                   {IncrementValue}, {false, false}, DL);
-  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
-                                   {InLoopIncrement, TripCount}, DL,
-                                   "active.lane.mask.next");
+  auto *ALM = Builder.createGetActiveLaneMask(InLoopIncrement, TripCount, DL,
+                                              "active.lane.mask.next");
   LaneMaskPhi->addOperand(ALM);
 
   // Replace the original terminator with BranchOnCond. We have to invert the
@@ -1144,9 +1141,8 @@ void VPlanTransforms::addActiveLaneMask(
     LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
         Plan, DataAndControlFlowWithoutRuntimeCheck);
   } else {
-    LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
-                                 {WideCanonicalIV, Plan.getTripCount()},
-                                 nullptr, "active.lane.mask");
+    LaneMask = new VPActiveLaneMaskRecipe(WideCanonicalIV, Plan.getTripCount(),
+                                          nullptr, "active.lane.mask");
     LaneMask->insertAfter(WideCanonicalIV);
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 8cc98f4abf933e0..b5b96cd269a6055 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -359,6 +359,7 @@ class VPDef {
     VPWidenMemoryInstructionSC,
     VPWidenSC,
     VPWidenSelectSC,
+    VPActiveLaneMaskSC,
     // START: Phi-like recipes. Need to be kept together.
     VPBlendSC,
     VPPredInstPHISC,
diff --git a/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll
new file mode 100644
index 000000000000000..430b349291d1a31
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll
@@ -0,0 +1,1015 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux"
+
+; Byte elements, UF=0
+define void @f0(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
+; CHECK-LABEL: f0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB0_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, w2
+; CHECK-NEXT:    rdvl x10, #1
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    whilelo p0.b, xzr, x9
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB0_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x1, x8]
+; CHECK-NEXT:    mul z0.b, z0.b, #3
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; CHECK-NEXT:    add x8, x8, x10
+; CHECK-NEXT:    whilelo p0.b, x8, x9
+; CHECK-NEXT:    b.mi .LBB0_2
+; CHECK-NEXT:  .LBB0_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 4
+  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %2 = getelementptr inbounds i8, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %2, i32 1, <vscale x 16 x i1> %active.lane.mask, <vscale x 16 x i8> poison)
+  %3 = mul <vscale x 16 x i8> %wide.masked.load, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %4 = getelementptr inbounds i8, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %3, ptr %4, i32 1, <vscale x 16 x i1> %active.lane.mask)
+  %index.next = add i64 %index, %1
+  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %5 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+  br i1 %5, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; Double-precision float elements, UF=0
+define void @f1(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
+; CHECK-LABEL: f1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB1_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, w2
+; CHECK-NEXT:    fmov z0.d, #3.00000000
+; CHECK-NEXT:    cntd x10
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    whilelo p0.d, xzr, x9
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB1_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; CHECK-NEXT:    fmul z1.d, z1.d, z0.d
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; CHECK-NEXT:    add x8, x8, x10
+; CHECK-NEXT:    whilelo p0.d, x8, x9
+; CHECK-NEXT:    b.mi .LBB1_2
+; CHECK-NEXT:  .LBB1_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 1
+  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %2 = getelementptr inbounds double, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr %2, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x double> poison)
+  %3 = fmul <vscale x 2 x double> %wide.masked.load, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %4 = getelementptr inbounds double, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %3, ptr %4, i32 8, <vscale x 2 x i1> %active.lane.mask)
+  %index.next = add i64 %index, %1
+  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %5 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
+  br i1 %5, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; Byte elements, UF=2
+define void @f2(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
+; CHECK-LABEL: f2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB2_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    rdvl x11, #1
+; CHECK-NEXT:    mov w8, w2
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    add x10, x0, x11
+; CHECK-NEXT:    whilelo p0.b, x11, x8
+; CHECK-NEXT:    add x11, x1, x11
+; CHECK-NEXT:    rdvl x12, #2
+; CHECK-NEXT:    rdvl x13, #3
+; CHECK-NEXT:    whilelo p1.b, xzr, x8
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB2_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x1, x9]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x11, x9]
+; CHECK-NEXT:    add x14, x13, x9
+; CHECK-NEXT:    mul z0.b, z0.b, #3
+; CHECK-NEXT:    mul z1.b, z1.b, #3
+; CHECK-NEXT:    st1b { z0.b }, p1, [x0, x9]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x10, x9]
+; CHECK-NEXT:    add x9, x12, x9
+; CHECK-NEXT:    whilelo p0.b, x14, x8
+; CHECK-NEXT:    whilelo p1.b, x9, x8
+; CHECK-NEXT:    b.mi .LBB2_2
+; CHECK-NEXT:  .LBB2_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 5
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 4
+  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %3, i64 %wide.trip.count)
+  %active.lane.mask.entry1 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+  %4 = tail call i64 @llvm.vscale.i64()
+  %5 = shl nuw nsw i64 %4, 4
+  %6 = tail call i64 @llvm.vscale.i64()
+  %7 = shl nuw nsw i64 %6, 4
+  %8 = tail call i64 @llvm.vscale.i64()
+  %9 = shl nuw nsw i64 %8, 4
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next4, %vector.body ]
+  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry1, %for.body.preheader ], [ %active.lane.mask.next5, %vector.body ]
+  %active.lane.mask2 = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %10 = getelementptr inbounds i8, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %10, i32 1, <vscale x 16 x i1> %active.lane.mask, <vscale x 16 x i8> poison)
+  %11 = getelementptr inbounds i8, ptr %10, i64 %5
+  %wide.masked.load3 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %11, i32 1, <vscale x 16 x i1> %active.lane.mask2, <vscale x 16 x i8> poison)
+  %12 = mul <vscale x 16 x i8> %wide.masked.load, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %13 = mul <vscale x 16 x i8> %wide.masked.load3, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %14 = getelementptr inbounds i8, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %12, ptr %14, i32 1, <vscale x 16 x i1> %active.lane.mask)
+  %15 = getelementptr inbounds i8, ptr %14, i64 %7
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %13, ptr %15, i32 1, <vscale x 16 x i1> %active.lane.mask2)
+  %index.next = add i64 %index, %1
+  %index.next4 = add i64 %index, %1
+  %16 = add i64 %index.next, %9
+  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %16, i64 %wide.trip.count)
+  %active.lane.mask.next5 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %17 = extractelement <vscale x 16 x i1> %active.lane.mask.next5, i64 0
+  br i1 %17, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; Double-precision float elements, UF=2
+define void @f3(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
+; CHECK-LABEL: f3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB3_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, w2
+; CHECK-NEXT:    rdvl x11, #1
+; CHECK-NEXT:    fmov z0.d, #3.00000000
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add x10, x0, x11
+; CHECK-NEXT:    whilelo p1.s, xzr, x9
+; CHECK-NEXT:    add x11, x1, x11
+; CHECK-NEXT:    cntw x12
+; CHECK-NEXT:    punpkhi p0.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB3_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x11, x8, lsl #3]
+; CHECK-NEXT:    fmul z1.d, z1.d, z0.d
+; CHECK-NEXT:    fmul z2.d, z2.d, z0.d
+; CHECK-NEXT:    st1d { z1.d }, p1, [x0, x8, lsl #3]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x10, x8, lsl #3]
+; CHECK-NEXT:    add x8, x12, x8
+; CHECK-NEXT:    whilelo p1.s, x8, x9
+; CHECK-NEXT:    punpkhi p0.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    mov z1.d, p1/z, #1 // =0x1
+; CHECK-NEXT:    fmov x13, d1
+; CHECK-NEXT:    tbnz w13, #0, .LBB3_2
+; CHECK-NEXT:  .LBB3_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+  %active.lane.mask.entry1 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry, i64 2)
+  %active.lane.mask.entry2 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry, i64 0)
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 1
+  %4 = tail call i64 @llvm.vscale.i64()
+  %5 = shl nuw nsw i64 %4, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next5, %vector.body ]
+  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry2, %for.body.preheader ], [ %active.lane.mask.next7, %vector.body ]
+  %active.lane.mask3 = phi <vscale x 2 x i1> [ %active.lane.mask.entry1, %for.body.preheader ], [ %active.lane.mask.next6, %vector.body ]
+  %6 = getelementptr inbounds double, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr %6, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x double> poison)
+  %7 = getelementptr inbounds double, ptr %6, i64 %3
+  %wide.masked.load4 = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull %7, i32 8, <vscale x 2 x i1> %active.lane.mask3, <vscale x 2 x double> poison)
+  %8 = fmul <vscale x 2 x double> %wide.masked.load, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %9 = fmul <vscale x 2 x double> %wide.masked.load4, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %10 = getelementptr inbounds double, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %8, ptr %10, i32 8, <vscale x 2 x i1> %active.lane.mask)
+  %11 = getelementptr inbounds double, ptr %10, i64 %5
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %9, ptr %11, i32 8, <vscale x 2 x i1> %active.lane.mask3)
+  %index.next = add i64 %index, %1
+  %index.next5 = add i64 %index, %1
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %active.lane.mask.next6 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next, i64 2)
+  %active.lane.mask.next7 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next, i64 0)
+  %12 = extractelement <vscale x 2 x i1> %active.lane.mask.next7, i64 0
+  br i1 %12, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; Byte elements, UF=4
+define void @f4(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
+; CHECK-LABEL: f4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB4_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    rdvl x14, #2
+; CHECK-NEXT:    mov w8, w2
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    add x11, x0, x14
+; CHECK-NEXT:    rdvl x13, #3
+; CHECK-NEXT:    rdvl x15, #1
+; CHECK-NEXT:    add x10, x0, x13
+; CHECK-NEXT:    add x12, x0, x15
+; CHECK-NEXT:    rdvl x16, #4
+; CHECK-NEXT:    rdvl x17, #5
+; CHECK-NEXT:    rdvl x18, #6
+; CHECK-NEXT:    rdvl x2, #7
+; CHECK-NEXT:    whilelo p0.b, x13, x8
+; CHECK-NEXT:    add x13, x1, x13
+; CHECK-NEXT:    whilelo p1.b, x14, x8
+; CHECK-NEXT:    add x14, x1, x14
+; CHECK-NEXT:    whilelo p2.b, x15, x8
+; CHECK-NEXT:    add x15, x1, x15
+; CHECK-NEXT:    whilelo p3.b, xzr, x8
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB4_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1b { z0.b }, p3/z, [x1, x9]
+; CHECK-NEXT:    ld1b { z1.b }, p2/z, [x15, x9]
+; CHECK-NEXT:    add x3, x2, x9
+; CHECK-NEXT:    mul z0.b, z0.b, #3
+; CHECK-NEXT:    ld1b { z2.b }, p1/z, [x14, x9]
+; CHECK-NEXT:    ld1b { z3.b }, p0/z, [x13, x9]
+; CHECK-NEXT:    mul z1.b, z1.b, #3
+; CHECK-NEXT:    mul z2.b, z2.b, #3
+; CHECK-NEXT:    mul z3.b, z3.b, #3
+; CHECK-NEXT:    st1b { z0.b }, p3, [x0, x9]
+; CHECK-NEXT:    st1b { z1.b }, p2, [x12, x9]
+; CHECK-NEXT:    st1b { z2.b }, p1, [x11, x9]
+; CHECK-NEXT:    st1b { z3.b }, p0, [x10, x9]
+; CHECK-NEXT:    whilelo p0.b, x3, x8
+; CHECK-NEXT:    add x3, x18, x9
+; CHECK-NEXT:    whilelo p1.b, x3, x8
+; CHECK-NEXT:    add x3, x17, x9
+; CHECK-NEXT:    add x9, x16, x9
+; CHECK-NEXT:    whilelo p2.b, x3, x8
+; CHECK-NEXT:    whilelo p3.b, x9, x8
+; CHECK-NEXT:    b.mi .LBB4_2
+; CHECK-NEXT:  .LBB4_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 6
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 4
+  %4 = tail call i64 @llvm.vscale.i64()
+  %5 = shl nuw nsw i64 %4, 5
+  %6 = tail call i64 @llvm.vscale.i64()
+  %7 = mul nuw nsw i64 %6, 48
+  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %7, i64 %wide.trip.count)
+  %active.lane.mask.entry3 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %5, i64 %wide.trip.count)
+  %active.lane.mask.entry4 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %3, i64 %wide.trip.count)
+  %active.lane.mask.entry5 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+  %8 = tail call i64 @llvm.vscale.i64()
+  %9 = shl nuw nsw i64 %8, 4
+  %10 = tail call i64 @llvm.vscale.i64()
+  %11 = shl nuw nsw i64 %10, 5
+  %12 = tail call i64 @llvm.vscale.i64()
+  %13 = mul nuw nsw i64 %12, 48
+  %14 = tail call i64 @llvm.vscale.i64()
+  %15 = shl nuw nsw i64 %14, 4
+  %16 = tail call i64 @llvm.vscale.i64()
+  %17 = shl nuw nsw i64 %16, 5
+  %18 = tail call i64 @llvm.vscale.i64()
+  %19 = mul nuw nsw i64 %18, 48
+  %20 = tail call i64 @llvm.vscale.i64()
+  %21 = shl nuw nsw i64 %20, 4
+  %22 = tail call i64 @llvm.vscale.i64()
+  %23 = shl nuw nsw i64 %22, 5
+  %24 = tail call i64 @llvm.vscale.i64()
+  %25 = mul nuw nsw i64 %24, 48
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next14, %vector.body ]
+  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry5, %for.body.preheader ], [ %active.lane.mask.next17, %vector.body ]
+  %active.lane.mask6 = phi <vscale x 16 x i1> [ %active.lane.mask.entry4, %for.body.preheader ], [ %active.lane.mask.next16, %vector.body ]
+  %active.lane.mask7 = phi <vscale x 16 x i1> [ %active.lane.mask.entry3, %for.body.preheader ], [ %active.lane.mask.next15, %vector.body ]
+  %active.lane.mask8 = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %26 = getelementptr inbounds i8, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %26, i32 1, <vscale x 16 x i1> %active.lane.mask, <vscale x 16 x i8> poison)
+  %27 = getelementptr inbounds i8, ptr %26, i64 %9
+  %wide.masked.load9 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %27, i32 1, <vscale x 16 x i1> %active.lane.mask6, <vscale x 16 x i8> poison)
+  %28 = getelementptr inbounds i8, ptr %26, i64 %11
+  %wide.masked.load10 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %28, i32 1, <vscale x 16 x i1> %active.lane.mask7, <vscale x 16 x i8> poison)
+  %29 = getelementptr inbounds i8, ptr %26, i64 %13
+  %wide.masked.load11 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %29, i32 1, <vscale x 16 x i1> %active.lane.mask8, <vscale x 16 x i8> poison)
+  %30 = mul <vscale x 16 x i8> %wide.masked.load, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %31 = mul <vscale x 16 x i8> %wide.masked.load9, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %32 = mul <vscale x 16 x i8> %wide.masked.load10, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %33 = mul <vscale x 16 x i8> %wide.masked.load11, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %34 = getelementptr inbounds i8, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %30, ptr %34, i32 1, <vscale x 16 x i1> %active.lane.mask)
+  %35 = getelementptr inbounds i8, ptr %34, i64 %15
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %31, ptr %35, i32 1, <vscale x 16 x i1> %active.lane.mask6)
+  %36 = getelementptr inbounds i8, ptr %34, i64 %17
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %32, ptr %36, i32 1, <vscale x 16 x i1> %active.lane.mask7)
+  %37 = getelementptr inbounds i8, ptr %34, i64 %19
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %33, ptr %37, i32 1, <vscale x 16 x i1> %active.lane.mask8)
+  %index.next = add i64 %index, %1
+  %index.next14 = add i64 %index, %1
+  %38 = add i64 %index.next, %21
+  %39 = add i64 %index.next, %23
+  %40 = add i64 %index.next, %25
+  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %40, i64 %wide.trip.count)
+  %active.lane.mask.next15 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %39, i64 %wide.trip.count)
+  %active.lane.mask.next16 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %38, i64 %wide.trip.count)
+  %active.lane.mask.next17 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %41 = extractelement <vscale x 16 x i1> %active.lane.mask.next17, i64 0
+  br i1 %41, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; Double-precision float elements, UF=4
+define void @f5(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
+; CHECK-LABEL: f5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB5_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    cntw x10
+; CHECK-NEXT:    mov w8, w2
+; CHECK-NEXT:    fmov z0.d, #3.00000000
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    rdvl x13, #3
+; CHECK-NEXT:    rdvl x14, #2
+; CHECK-NEXT:    add x11, x0, x14
+; CHECK-NEXT:    add x14, x1, x14
+; CHECK-NEXT:    rdvl x15, #1
+; CHECK-NEXT:    add x12, x0, x15
+; CHECK-NEXT:    add x15, x1, x15
+; CHECK-NEXT:    cnth x16
+; CHECK-NEXT:    cntw x17, all, mul #3
+; CHECK-NEXT:    whilelo p1.s, x10, x8
+; CHECK-NEXT:    add x10, x0, x13
+; CHECK-NEXT:    add x13, x1, x13
+; CHECK-NEXT:    punpkhi p0.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    whilelo p3.s, xzr, x8
+; CHECK-NEXT:    punpkhi p2.h, p3.b
+; CHECK-NEXT:    punpklo p3.h, p3.b
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB5_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z1.d }, p3/z, [x1, x9, lsl #3]
+; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x15, x9, lsl #3]
+; CHECK-NEXT:    add x18, x17, x9
+; CHECK-NEXT:    fmul z1.d, z1.d, z0.d
+; CHECK-NEXT:    ld1d { z3.d }, p1/z, [x14, x9, lsl #3]
+; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x13, x9, lsl #3]
+; CHECK-NEXT:    fmul z2.d, z2.d, z0.d
+; CHECK-NEXT:    fmul z3.d, z3.d, z0.d
+; CHECK-NEXT:    fmul z4.d, z4.d, z0.d
+; CHECK-NEXT:    st1d { z1.d }, p3, [x0, x9, lsl #3]
+; CHECK-NEXT:    st1d { z2.d }, p2, [x12, x9, lsl #3]
+; CHECK-NEXT:    st1d { z3.d }, p1, [x11, x9, lsl #3]
+; CHECK-NEXT:    st1d { z4.d }, p0, [x10, x9, lsl #3]
+; CHECK-NEXT:    add x9, x16, x9
+; CHECK-NEXT:    whilelo p1.s, x18, x8
+; CHECK-NEXT:    whilelo p3.s, x9, x8
+; CHECK-NEXT:    punpkhi p0.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpkhi p2.h, p3.b
+; CHECK-NEXT:    punpklo p3.h, p3.b
+; CHECK-NEXT:    mov z1.d, p3/z, #1 // =0x1
+; CHECK-NEXT:    fmov x18, d1
+; CHECK-NEXT:    tbnz w18, #0, .LBB5_2
+; CHECK-NEXT:  .LBB5_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 3
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %3, i64 %wide.trip.count)
+  %active.lane.mask.entry3 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+  %active.lane.mask.entry4 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry, i64 2)
+  %active.lane.mask.entry5 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry, i64 0)
+  %active.lane.mask.entry6 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry3, i64 2)
+  %active.lane.mask.entry7 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry3, i64 0)
+  %4 = tail call i64 @llvm.vscale.i64()
+  %5 = shl nuw nsw i64 %4, 1
+  %6 = tail call i64 @llvm.vscale.i64()
+  %7 = shl nuw nsw i64 %6, 2
+  %8 = tail call i64 @llvm.vscale.i64()
+  %9 = mul nuw nsw i64 %8, 6
+  %10 = tail call i64 @llvm.vscale.i64()
+  %11 = shl nuw nsw i64 %10, 1
+  %12 = tail call i64 @llvm.vscale.i64()
+  %13 = shl nuw nsw i64 %12, 2
+  %14 = tail call i64 @llvm.vscale.i64()
+  %15 = mul nuw nsw i64 %14, 6
+  %16 = tail call i64 @llvm.vscale.i64()
+  %17 = shl nuw nsw i64 %16, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next16, %vector.body ]
+  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry7, %for.body.preheader ], [ %active.lane.mask.next21, %vector.body ]
+  %active.lane.mask8 = phi <vscale x 2 x i1> [ %active.lane.mask.entry6, %for.body.preheader ], [ %active.lane.mask.next20, %vector.body ]
+  %active.lane.mask9 = phi <vscale x 2 x i1> [ %active.lane.mask.entry5, %for.body.preheader ], [ %active.lane.mask.next19, %vector.body ]
+  %active.lane.mask10 = phi <vscale x 2 x i1> [ %active.lane.mask.entry4, %for.body.preheader ], [ %active.lane.mask.next18, %vector.body ]
+  %18 = getelementptr inbounds double, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr %18, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x double> poison)
+  %19 = getelementptr inbounds double, ptr %18, i64 %5
+  %wide.masked.load11 = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull %19, i32 8, <vscale x 2 x i1> %active.lane.mask8, <vscale x 2 x double> poison)
+  %20 = getelementptr inbounds double, ptr %18, i64 %7
+  %wide.masked.load12 = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull %20, i32 8, <vscale x 2 x i1> %active.lane.mask9, <vscale x 2 x double> poison)
+  %21 = getelementptr inbounds double, ptr %18, i64 %9
+  %wide.masked.load13 = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull %21, i32 8, <vscale x 2 x i1> %active.lane.mask10, <vscale x 2 x double> poison)
+  %22 = fmul <vscale x 2 x double> %wide.masked.load, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %23 = fmul <vscale x 2 x double> %wide.masked.load11, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %24 = fmul <vscale x 2 x double> %wide.masked.load12, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %25 = fmul <vscale x 2 x double> %wide.masked.load13, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %26 = getelementptr inbounds double, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %22, ptr %26, i32 8, <vscale x 2 x i1> %active.lane.mask)
+  %27 = getelementptr inbounds double, ptr %26, i64 %11
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %23, ptr %27, i32 8, <vscale x 2 x i1> %active.lane.mask8)
+  %28 = getelementptr inbounds double, ptr %26, i64 %13
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %24, ptr %28, i32 8, <vscale x 2 x i1> %active.lane.mask9)
+  %29 = getelementptr inbounds double, ptr %26, i64 %15
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %25, ptr %29, i32 8, <vscale x 2 x i1> %active.lane.mask10)
+  %index.next = add i64 %index, %1
+  %index.next16 = add i64 %index, %1
+  %30 = add i64 %index.next, %17
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %30, i64 %wide.trip.count)
+  %active.lane.mask.next17 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %active.lane.mask.next18 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next, i64 2)
+  %active.lane.mask.next19 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next, i64 0)
+  %active.lane.mask.next20 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next17, i64 2)
+  %active.lane.mask.next21 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next17, i64 0)
+  %31 = extractelement <vscale x 2 x i1> %active.lane.mask.next21, i64 0
+  br i1 %31, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; SVE 2.1, byte elements, UF=0
+define void @f6(ptr noalias %dst, ptr readonly %src, i32 %n) #1 {
+; CHECK-LABEL: f6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB6_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, w2
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    whilelo p0.b, xzr, x9
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB6_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x1, x8]
+; CHECK-NEXT:    mul z0.b, z0.b, #3
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    whilelo p0.b, x8, x9
+; CHECK-NEXT:    b.mi .LBB6_2
+; CHECK-NEXT:  .LBB6_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 4
+  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %2 = getelementptr inbounds i8, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %2, i32 1, <vscale x 16 x i1> %active.lane.mask, <vscale x 16 x i8> poison)
+  %3 = mul <vscale x 16 x i8> %wide.masked.load, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %4 = getelementptr inbounds i8, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %3, ptr %4, i32 1, <vscale x 16 x i1> %active.lane.mask)
+  %index.next = add i64 %index, %1
+  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %5 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+  br i1 %5, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; SVE2.1, double-precision float elements, UF=0
+define void @f7(ptr noalias %dst, ptr readonly %src, i32 %n) #1 {
+; CHECK-LABEL: f7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB7_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, w2
+; CHECK-NEXT:    fmov z0.d, #3.00000000
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    whilelo p0.d, xzr, x9
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB7_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; CHECK-NEXT:    fmul z1.d, z1.d, z0.d
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; CHECK-NEXT:    incd x8
+; CHECK-NEXT:    whilelo p0.d, x8, x9
+; CHECK-NEXT:    b.mi .LBB7_2
+; CHECK-NEXT:  .LBB7_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 1
+  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %2 = getelementptr inbounds double, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr %2, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x double> poison)
+  %3 = fmul <vscale x 2 x double> %wide.masked.load, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %4 = getelementptr inbounds double, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %3, ptr %4, i32 8, <vscale x 2 x i1> %active.lane.mask)
+  %index.next = add i64 %index, %1
+  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %5 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
+  br i1 %5, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; SVE 2.1, byte elements, UF=2
+define void @f8(ptr noalias %dst, ptr readonly %src, i32 %n) #1 {
+; CHECK-LABEL: f8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB8_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, w2
+; CHECK-NEXT:    addvl x10, x0, #1
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    whilelo { p0.b, p1.b }, xzr, x9
+; CHECK-NEXT:    addvl x11, x1, #1
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB8_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x1, x8]
+; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x11, x8]
+; CHECK-NEXT:    mul z0.b, z0.b, #3
+; CHECK-NEXT:    mul z1.b, z1.b, #3
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; CHECK-NEXT:    st1b { z1.b }, p1, [x10, x8]
+; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    whilelo { p0.b, p1.b }, x8, x9
+; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    fmov w12, s0
+; CHECK-NEXT:    tbnz w12, #0, .LBB8_2
+; CHECK-NEXT:  .LBB8_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 5
+  %active.lane.mask.entry = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 %wide.trip.count)
+  %active.lane.mask.entry1 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.entry, i64 16)
+  %active.lane.mask.entry2 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.entry, i64 0)
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 4
+  %4 = tail call i64 @llvm.vscale.i64()
+  %5 = shl nuw nsw i64 %4, 4
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next5, %vector.body ]
+  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry2, %for.body.preheader ], [ %active.lane.mask.next7, %vector.body ]
+  %active.lane.mask3 = phi <vscale x 16 x i1> [ %active.lane.mask.entry1, %for.body.preheader ], [ %active.lane.mask.next6, %vector.body ]
+  %6 = getelementptr inbounds i8, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %6, i32 1, <vscale x 16 x i1> %active.lane.mask, <vscale x 16 x i8> poison)
+  %7 = getelementptr inbounds i8, ptr %6, i64 %3
+  %wide.masked.load4 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %7, i32 1, <vscale x 16 x i1> %active.lane.mask3, <vscale x 16 x i8> poison)
+  %8 = mul <vscale x 16 x i8> %wide.masked.load, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %9 = mul <vscale x 16 x i8> %wide.masked.load4, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %10 = getelementptr inbounds i8, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %8, ptr %10, i32 1, <vscale x 16 x i1> %active.lane.mask)
+  %11 = getelementptr inbounds i8, ptr %10, i64 %5
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %9, ptr %11, i32 1, <vscale x 16 x i1> %active.lane.mask3)
+  %index.next = add i64 %index, %1
+  %index.next5 = add i64 %index, %1
+  %active.lane.mask.next = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %active.lane.mask.next6 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.next, i64 16)
+  %active.lane.mask.next7 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.next, i64 0)
+  %12 = extractelement <vscale x 16 x i1> %active.lane.mask.next7, i64 0
+  br i1 %12, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; SVE2.1, double-precision float elements, UF=2
+define void @f9(ptr noalias %dst, ptr readonly %src, i32 %n) #1 {
+; CHECK-LABEL: f9:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB9_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, w2
+; CHECK-NEXT:    addvl x10, x0, #1
+; CHECK-NEXT:    fmov z0.d, #3.00000000
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    whilelo { p0.d, p1.d }, xzr, x9
+; CHECK-NEXT:    addvl x11, x1, #1
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB9_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z2.d }, p1/z, [x11, x8, lsl #3]
+; CHECK-NEXT:    fmul z1.d, z1.d, z0.d
+; CHECK-NEXT:    fmul z2.d, z2.d, z0.d
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; CHECK-NEXT:    st1d { z2.d }, p1, [x10, x8, lsl #3]
+; CHECK-NEXT:    incw x8
+; CHECK-NEXT:    whilelo { p0.d, p1.d }, x8, x9
+; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
+; CHECK-NEXT:    fmov x12, d1
+; CHECK-NEXT:    tbnz w12, #0, .LBB9_2
+; CHECK-NEXT:  .LBB9_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+  %active.lane.mask.entry1 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry, i64 2)
+  %active.lane.mask.entry2 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry, i64 0)
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 1
+  %4 = tail call i64 @llvm.vscale.i64()
+  %5 = shl nuw nsw i64 %4, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next5, %vector.body ]
+  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry2, %for.body.preheader ], [ %active.lane.mask.next7, %vector.body ]
+  %active.lane.mask3 = phi <vscale x 2 x i1> [ %active.lane.mask.entry1, %for.body.preheader ], [ %active.lane.mask.next6, %vector.body ]
+  %6 = getelementptr inbounds double, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr %6, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x double> poison)
+  %7 = getelementptr inbounds double, ptr %6, i64 %3
+  %wide.masked.load4 = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull %7, i32 8, <vscale x 2 x i1> %active.lane.mask3, <vscale x 2 x double> poison)
+  %8 = fmul <vscale x 2 x double> %wide.masked.load, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %9 = fmul <vscale x 2 x double> %wide.masked.load4, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %10 = getelementptr inbounds double, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %8, ptr %10, i32 8, <vscale x 2 x i1> %active.lane.mask)
+  %11 = getelementptr inbounds double, ptr %10, i64 %5
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %9, ptr %11, i32 8, <vscale x 2 x i1> %active.lane.mask3)
+  %index.next = add i64 %index, %1
+  %index.next5 = add i64 %index, %1
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %active.lane.mask.next6 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next, i64 2)
+  %active.lane.mask.next7 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next, i64 0)
+  %12 = extractelement <vscale x 2 x i1> %active.lane.mask.next7, i64 0
+  br i1 %12, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; SVE 2.1, byte elements, UF=4
+define void @f10(ptr noalias %dst, ptr readonly %src, i32 %n) #1 {
+; CHECK-LABEL: f10:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB10_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    rdvl x10, #2
+; CHECK-NEXT:    mov w8, w2
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    whilelo { p2.b, p3.b }, xzr, x8
+; CHECK-NEXT:    addvl x11, x0, #2
+; CHECK-NEXT:    whilelo { p0.b, p1.b }, x10, x8
+; CHECK-NEXT:    addvl x10, x0, #3
+; CHECK-NEXT:    addvl x12, x0, #1
+; CHECK-NEXT:    addvl x13, x1, #3
+; CHECK-NEXT:    addvl x14, x1, #2
+; CHECK-NEXT:    addvl x15, x1, #1
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB10_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1b { z0.b }, p2/z, [x1, x9]
+; CHECK-NEXT:    ld1b { z1.b }, p3/z, [x15, x9]
+; CHECK-NEXT:    addvl x16, x9, #6
+; CHECK-NEXT:    mul z0.b, z0.b, #3
+; CHECK-NEXT:    ld1b { z2.b }, p0/z, [x14, x9]
+; CHECK-NEXT:    ld1b { z3.b }, p1/z, [x13, x9]
+; CHECK-NEXT:    mul z1.b, z1.b, #3
+; CHECK-NEXT:    mul z2.b, z2.b, #3
+; CHECK-NEXT:    mul z3.b, z3.b, #3
+; CHECK-NEXT:    st1b { z0.b }, p2, [x0, x9]
+; CHECK-NEXT:    st1b { z1.b }, p3, [x12, x9]
+; CHECK-NEXT:    st1b { z2.b }, p0, [x11, x9]
+; CHECK-NEXT:    st1b { z3.b }, p1, [x10, x9]
+; CHECK-NEXT:    addvl x9, x9, #4
+; CHECK-NEXT:    whilelo { p0.b, p1.b }, x16, x8
+; CHECK-NEXT:    whilelo { p2.b, p3.b }, x9, x8
+; CHECK-NEXT:    mov z0.b, p2/z, #1 // =0x1
+; CHECK-NEXT:    fmov w16, s0
+; CHECK-NEXT:    tbnz w16, #0, .LBB10_2
+; CHECK-NEXT:  .LBB10_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 6
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 5
+  %active.lane.mask.entry = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %3, i64 %wide.trip.count)
+  %active.lane.mask.entry3 = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 %wide.trip.count)
+  %active.lane.mask.entry4 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.entry, i64 16)
+  %active.lane.mask.entry5 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.entry, i64 0)
+  %active.lane.mask.entry6 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.entry3, i64 16)
+  %active.lane.mask.entry7 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.entry3, i64 0)
+  %4 = tail call i64 @llvm.vscale.i64()
+  %5 = shl nuw nsw i64 %4, 4
+  %6 = tail call i64 @llvm.vscale.i64()
+  %7 = shl nuw nsw i64 %6, 5
+  %8 = tail call i64 @llvm.vscale.i64()
+  %9 = mul nuw nsw i64 %8, 48
+  %10 = tail call i64 @llvm.vscale.i64()
+  %11 = shl nuw nsw i64 %10, 4
+  %12 = tail call i64 @llvm.vscale.i64()
+  %13 = shl nuw nsw i64 %12, 5
+  %14 = tail call i64 @llvm.vscale.i64()
+  %15 = mul nuw nsw i64 %14, 48
+  %16 = tail call i64 @llvm.vscale.i64()
+  %17 = shl nuw nsw i64 %16, 5
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next16, %vector.body ]
+  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry7, %for.body.preheader ], [ %active.lane.mask.next21, %vector.body ]
+  %active.lane.mask8 = phi <vscale x 16 x i1> [ %active.lane.mask.entry6, %for.body.preheader ], [ %active.lane.mask.next20, %vector.body ]
+  %active.lane.mask9 = phi <vscale x 16 x i1> [ %active.lane.mask.entry5, %for.body.preheader ], [ %active.lane.mask.next19, %vector.body ]
+  %active.lane.mask10 = phi <vscale x 16 x i1> [ %active.lane.mask.entry4, %for.body.preheader ], [ %active.lane.mask.next18, %vector.body ]
+  %18 = getelementptr inbounds i8, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %18, i32 1, <vscale x 16 x i1> %active.lane.mask, <vscale x 16 x i8> poison)
+  %19 = getelementptr inbounds i8, ptr %18, i64 %5
+  %wide.masked.load11 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %19, i32 1, <vscale x 16 x i1> %active.lane.mask8, <vscale x 16 x i8> poison)
+  %20 = getelementptr inbounds i8, ptr %18, i64 %7
+  %wide.masked.load12 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %20, i32 1, <vscale x 16 x i1> %active.lane.mask9, <vscale x 16 x i8> poison)
+  %21 = getelementptr inbounds i8, ptr %18, i64 %9
+  %wide.masked.load13 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %21, i32 1, <vscale x 16 x i1> %active.lane.mask10, <vscale x 16 x i8> poison)
+  %22 = mul <vscale x 16 x i8> %wide.masked.load, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %23 = mul <vscale x 16 x i8> %wide.masked.load11, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %24 = mul <vscale x 16 x i8> %wide.masked.load12, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %25 = mul <vscale x 16 x i8> %wide.masked.load13, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %26 = getelementptr inbounds i8, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %22, ptr %26, i32 1, <vscale x 16 x i1> %active.lane.mask)
+  %27 = getelementptr inbounds i8, ptr %26, i64 %11
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %23, ptr %27, i32 1, <vscale x 16 x i1> %active.lane.mask8)
+  %28 = getelementptr inbounds i8, ptr %26, i64 %13
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %24, ptr %28, i32 1, <vscale x 16 x i1> %active.lane.mask9)
+  %29 = getelementptr inbounds i8, ptr %26, i64 %15
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %25, ptr %29, i32 1, <vscale x 16 x i1> %active.lane.mask10)
+  %index.next = add i64 %index, %1
+  %index.next16 = add i64 %index, %1
+  %30 = add i64 %index.next, %17
+  %active.lane.mask.next = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %30, i64 %wide.trip.count)
+  %active.lane.mask.next17 = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %active.lane.mask.next18 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.next, i64 16)
+  %active.lane.mask.next19 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.next, i64 0)
+  %active.lane.mask.next20 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.next17, i64 16)
+  %active.lane.mask.next21 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.next17, i64 0)
+  %31 = extractelement <vscale x 16 x i1> %active.lane.mask.next21, i64 0
+  br i1 %31, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; SVE2.1, double-precision float elements, UF=4
+define void @f11(ptr noalias %dst, ptr readonly %src, i32 %n) #1 {
+; CHECK-LABEL: f11:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB11_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    cntw x10
+; CHECK-NEXT:    mov w8, w2
+; CHECK-NEXT:    fmov z0.d, #3.00000000
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    mov x15, xzr
+; CHECK-NEXT:    whilelo { p2.d, p3.d }, xzr, x8
+; CHECK-NEXT:    addvl x11, x0, #2
+; CHECK-NEXT:    whilelo { p0.d, p1.d }, x10, x8
+; CHECK-NEXT:    addvl x10, x0, #3
+; CHECK-NEXT:    addvl x12, x0, #1
+; CHECK-NEXT:    addvl x13, x1, #3
+; CHECK-NEXT:    addvl x14, x1, #2
+; CHECK-NEXT:    addvl x16, x1, #1
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB11_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z1.d }, p2/z, [x1, x9, lsl #3]
+; CHECK-NEXT:    ld1d { z2.d }, p3/z, [x16, x9, lsl #3]
+; CHECK-NEXT:    inch x15
+; CHECK-NEXT:    fmul z1.d, z1.d, z0.d
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x14, x9, lsl #3]
+; CHECK-NEXT:    ld1d { z4.d }, p1/z, [x13, x9, lsl #3]
+; CHECK-NEXT:    fmul z2.d, z2.d, z0.d
+; CHECK-NEXT:    fmul z3.d, z3.d, z0.d
+; CHECK-NEXT:    fmul z4.d, z4.d, z0.d
+; CHECK-NEXT:    st1d { z1.d }, p2, [x0, x9, lsl #3]
+; CHECK-NEXT:    st1d { z2.d }, p3, [x12, x9, lsl #3]
+; CHECK-NEXT:    whilelo { p2.d, p3.d }, x15, x8
+; CHECK-NEXT:    mov z1.d, p2/z, #1 // =0x1
+; CHECK-NEXT:    st1d { z3.d }, p0, [x11, x9, lsl #3]
+; CHECK-NEXT:    st1d { z4.d }, p1, [x10, x9, lsl #3]
+; CHECK-NEXT:    incw x9, all, mul #3
+; CHECK-NEXT:    whilelo { p0.d, p1.d }, x9, x8
+; CHECK-NEXT:    mov x9, x15
+; CHECK-NEXT:    fmov x17, d1
+; CHECK-NEXT:    tbnz w17, #0, .LBB11_2
+; CHECK-NEXT:  .LBB11_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 3
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %3, i64 %wide.trip.count)
+  %active.lane.mask.entry3 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+  %active.lane.mask.entry4 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry, i64 2)
+  %active.lane.mask.entry5 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry, i64 0)
+  %active.lane.mask.entry6 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry3, i64 2)
+  %active.lane.mask.entry7 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.entry3, i64 0)
+  %4 = tail call i64 @llvm.vscale.i64()
+  %5 = shl nuw nsw i64 %4, 1
+  %6 = tail call i64 @llvm.vscale.i64()
+  %7 = shl nuw nsw i64 %6, 2
+  %8 = tail call i64 @llvm.vscale.i64()
+  %9 = mul nuw nsw i64 %8, 6
+  %10 = tail call i64 @llvm.vscale.i64()
+  %11 = shl nuw nsw i64 %10, 1
+  %12 = tail call i64 @llvm.vscale.i64()
+  %13 = shl nuw nsw i64 %12, 2
+  %14 = tail call i64 @llvm.vscale.i64()
+  %15 = mul nuw nsw i64 %14, 6
+  %16 = tail call i64 @llvm.vscale.i64()
+  %17 = shl nuw nsw i64 %16, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next16, %vector.body ]
+  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry7, %for.body.preheader ], [ %active.lane.mask.next21, %vector.body ]
+  %active.lane.mask8 = phi <vscale x 2 x i1> [ %active.lane.mask.entry6, %for.body.preheader ], [ %active.lane.mask.next20, %vector.body ]
+  %active.lane.mask9 = phi <vscale x 2 x i1> [ %active.lane.mask.entry5, %for.body.preheader ], [ %active.lane.mask.next19, %vector.body ]
+  %active.lane.mask10 = phi <vscale x 2 x i1> [ %active.lane.mask.entry4, %for.body.preheader ], [ %active.lane.mask.next18, %vector.body ]
+  %18 = getelementptr inbounds double, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr %18, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x double> poison)
+  %19 = getelementptr inbounds double, ptr %18, i64 %5
+  %wide.masked.load11 = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull %19, i32 8, <vscale x 2 x i1> %active.lane.mask8, <vscale x 2 x double> poison)
+  %20 = getelementptr inbounds double, ptr %18, i64 %7
+  %wide.masked.load12 = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull %20, i32 8, <vscale x 2 x i1> %active.lane.mask9, <vscale x 2 x double> poison)
+  %21 = getelementptr inbounds double, ptr %18, i64 %9
+  %wide.masked.load13 = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull %21, i32 8, <vscale x 2 x i1> %active.lane.mask10, <vscale x 2 x double> poison)
+  %22 = fmul <vscale x 2 x double> %wide.masked.load, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %23 = fmul <vscale x 2 x double> %wide.masked.load11, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %24 = fmul <vscale x 2 x double> %wide.masked.load12, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %25 = fmul <vscale x 2 x double> %wide.masked.load13, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %26 = getelementptr inbounds double, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %22, ptr %26, i32 8, <vscale x 2 x i1> %active.lane.mask)
+  %27 = getelementptr inbounds double, ptr %26, i64 %11
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %23, ptr %27, i32 8, <vscale x 2 x i1> %active.lane.mask8)
+  %28 = getelementptr inbounds double, ptr %26, i64 %13
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %24, ptr %28, i32 8, <vscale x 2 x i1> %active.lane.mask9)
+  %29 = getelementptr inbounds double, ptr %26, i64 %15
+  tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %25, ptr %29, i32 8, <vscale x 2 x i1> %active.lane.mask10)
+  %index.next = add i64 %index, %1
+  %index.next16 = add i64 %index, %1
+  %30 = add i64 %index.next, %17
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %30, i64 %wide.trip.count)
+  %active.lane.mask.next17 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %active.lane.mask.next18 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next, i64 2)
+  %active.lane.mask.next19 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next, i64 0)
+  %active.lane.mask.next20 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next17, i64 2)
+  %active.lane.mask.next21 = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %active.lane.mask.next17, i64 0)
+  %31 = extractelement <vscale x 2 x i1> %active.lane.mask.next21, i64 0
+  br i1 %31, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
+declare <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1>, i64 immarg)
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64)
+declare <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1>, i64 immarg)
+declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
+declare i64 @llvm.vscale.i64()
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)
+declare void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double>, ptr nocapture, i32 immarg, <vscale x 2 x i1>)
+
+attributes #0 = { nounwind vscale_range(1,16) "target-cpu"="neoverse-v1" }
+attributes #1 = { nounwind vscale_range(1,16) "target-cpu"="neoverse-v1" "target-features"="+sve2p1" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index fc67fb5aded6a8d..b1c91cac01dabd8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -42,22 +42,22 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP7]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP8]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP9]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP7]])
+; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP9]])
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-UNORDERED:       scalar.ph:
@@ -89,18 +89,18 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP7]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP8]], align 4
+; CHECK-ORDERED-NEXT:    [[TMP9]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
+; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-ORDERED:       middle.block:
@@ -108,7 +108,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
 ; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -120,7 +120,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    ret float [[ADD_LCSSA]]
 ;
 ; CHECK-ORDERED-TF-LABEL: define float @fadd_strict
@@ -136,27 +136,27 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       vector.body:
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP12]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP13]])
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP12]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP14]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP16]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP15]])
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP11]])
 ; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 8 x i1> [[TMP17]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -164,7 +164,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -176,7 +176,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-ORDERED-TF:       for.end:
-; CHECK-ORDERED-TF-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    ret float [[ADD_LCSSA]]
 ;
 
@@ -230,60 +230,60 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[TMP38:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP39:%.*]] = mul i64 [[TMP38]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP34]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[TMP35]] = fadd <vscale x 8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT:    [[TMP36]] = fadd <vscale x 8 x float> [[WIDE_LOAD5]], [[VEC_PHI2]]
-; CHECK-UNORDERED-NEXT:    [[TMP37]] = fadd <vscale x 8 x float> [[WIDE_LOAD6]], [[VEC_PHI3]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP39]]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 1
+; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 1
+; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]]
+; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]]
+; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]]
+; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0
+; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]]
+; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]]
+; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP33]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]]
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP26]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP29]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP32]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP35]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP36]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-UNORDERED-NEXT:    [[TMP37]] = fadd <vscale x 8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]]
+; CHECK-UNORDERED-NEXT:    [[TMP38]] = fadd <vscale x 8 x float> [[WIDE_LOAD5]], [[VEC_PHI2]]
+; CHECK-UNORDERED-NEXT:    [[TMP39]] = fadd <vscale x 8 x float> [[WIDE_LOAD6]], [[VEC_PHI3]]
+; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-UNORDERED-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP35]], [[TMP34]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX7:%.*]] = fadd <vscale x 8 x float> [[TMP36]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX8:%.*]] = fadd <vscale x 8 x float> [[TMP37]], [[BIN_RDX7]]
+; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP37]], [[TMP36]]
+; CHECK-UNORDERED-NEXT:    [[BIN_RDX7:%.*]] = fadd <vscale x 8 x float> [[TMP38]], [[BIN_RDX]]
+; CHECK-UNORDERED-NEXT:    [[BIN_RDX8:%.*]] = fadd <vscale x 8 x float> [[TMP39]], [[BIN_RDX7]]
 ; CHECK-UNORDERED-NEXT:    [[TMP41:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX8]])
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -316,51 +316,51 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[TMP38:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP39:%.*]] = mul i64 [[TMP38]], 32
+; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
-; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 16
-; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 24
-; CHECK-ORDERED-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP34:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT:    [[TMP35:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP34]], <vscale x 8 x float> [[WIDE_LOAD1]])
-; CHECK-ORDERED-NEXT:    [[TMP36:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP35]], <vscale x 8 x float> [[WIDE_LOAD2]])
-; CHECK-ORDERED-NEXT:    [[TMP37]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP36]], <vscale x 8 x float> [[WIDE_LOAD3]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP39]]
+; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], 0
+; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 1
+; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 24
+; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], 0
+; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 1
+; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]]
+; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]]
+; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]]
+; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0
+; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-ORDERED-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]]
+; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 16
+; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]]
+; CHECK-ORDERED-NEXT:    [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP33]], 24
+; CHECK-ORDERED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]]
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP26]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP29]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP32]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP35]], align 4
+; CHECK-ORDERED-NEXT:    [[TMP36:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
+; CHECK-ORDERED-NEXT:    [[TMP37:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP36]], <vscale x 8 x float> [[WIDE_LOAD1]])
+; CHECK-ORDERED-NEXT:    [[TMP38:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP37]], <vscale x 8 x float> [[WIDE_LOAD2]])
+; CHECK-ORDERED-NEXT:    [[TMP39]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP38]], <vscale x 8 x float> [[WIDE_LOAD3]])
+; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-ORDERED-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-ORDERED:       middle.block:
@@ -368,7 +368,7 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
 ; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -380,7 +380,7 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    ret float [[ADD_LCSSA]]
 ;
 ; CHECK-ORDERED-TF-LABEL: define float @fadd_strict_unroll
@@ -396,92 +396,94 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP70:%.*]] = mul i64 [[TMP69]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = sub i64 [[N]], [[TMP13]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[N]], [[TMP13]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = sub i64 [[N]], [[TMP18]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = icmp ugt i64 [[N]], [[TMP18]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[TMP19]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = sub i64 [[N]], [[TMP23]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = icmp ugt i64 [[N]], [[TMP23]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 16
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP28]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 24
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 16
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP30]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 24
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP32]]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 0)
 ; CHECK-ORDERED-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       vector.body:
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 8
-; CHECK-ORDERED-TF-NEXT:    [[TMP34:%.*]] = add i64 [[TMP33]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 1
-; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 16
-; CHECK-ORDERED-TF-NEXT:    [[TMP39:%.*]] = add i64 [[TMP38]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 1
-; CHECK-ORDERED-TF-NEXT:    [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 24
-; CHECK-ORDERED-TF-NEXT:    [[TMP44:%.*]] = add i64 [[TMP43]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 1
-; CHECK-ORDERED-TF-NEXT:    [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP53:%.*]] = mul i64 [[TMP52]], 8
-; CHECK-ORDERED-TF-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP56:%.*]] = mul i64 [[TMP55]], 16
-; CHECK-ORDERED-TF-NEXT:    [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP58]], 24
-; CHECK-ORDERED-TF-NEXT:    [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]]
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[TMP61:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP62:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP61]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP63:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP64:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], <vscale x 8 x float> [[TMP63]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP65:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY6]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK10:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP70:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[TMP33:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP37:%.*]] = mul i64 [[TMP36]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP38:%.*]] = add i64 [[INDEX]], [[TMP37]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP41:%.*]] = add i64 [[TMP40]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP43:%.*]] = add i64 [[INDEX]], [[TMP42]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP46:%.*]] = add i64 [[TMP45]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP47:%.*]] = mul i64 [[TMP46]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP48:%.*]] = add i64 [[INDEX]], [[TMP47]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP33]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP38]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP43]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP52:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP48]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i32 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP54:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP54]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP55]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP57]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP58]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP61:%.*]] = mul i64 [[TMP60]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP61]]
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP53]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP56]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP59]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP62]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[TMP63:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP64:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP63]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP65:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP66:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP64]], <vscale x 8 x float> [[TMP65]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP67:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP68]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], <vscale x 8 x float> [[TMP67]])
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP70]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP67:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 8 x float> [[WIDE_MASKED_LOAD12]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP68:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], <vscale x 8 x float> [[TMP67]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP69:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 8 x float> [[WIDE_MASKED_LOAD13]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP70]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP68]], <vscale x 8 x float> [[TMP69]])
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP71:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP72:%.*]] = mul i64 [[TMP71]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP73:%.*]] = add i64 [[INDEX]], [[TMP72]]
@@ -491,21 +493,23 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP77:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP77]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP79:%.*]] = add i64 [[INDEX]], [[TMP78]]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP73]], i64 [[TMP14]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP76]], i64 [[TMP19]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP79]], i64 [[TMP24]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP80:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP81:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT12]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP82:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT13]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP83:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT14]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP76]], i64 [[TMP11]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT14:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT15]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT14]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT14]], i64 0)
+; CHECK-ORDERED-TF-NEXT:    [[TMP80:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT18]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP81:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT17]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP82:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP83:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT15]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP84:%.*]] = extractelement <vscale x 8 x i1> [[TMP80]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-ORDERED-TF:       middle.block:
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP70]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -517,7 +521,7 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-ORDERED-TF:       for.end:
-; CHECK-ORDERED-TF-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP70]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    ret float [[ADD_LCSSA]]
 ;
 
@@ -589,37 +593,37 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[A2]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[A1]], i32 0
+; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[A2]], i32 0
+; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[A1]], i32 0
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x float> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x float> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-UNORDERED-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
-; CHECK-UNORDERED-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, ptr [[TMP11]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; CHECK-UNORDERED-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, ptr [[TMP13]], align 4
 ; CHECK-UNORDERED-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP14]] = fadd <vscale x 4 x float> [[TMP12]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT:    [[TMP15]] = fadd <vscale x 4 x float> [[TMP13]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]]
+; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; CHECK-UNORDERED-NEXT:    [[TMP16]] = fadd <vscale x 4 x float> [[TMP14]], [[VEC_PHI1]]
+; CHECK-UNORDERED-NEXT:    [[TMP17]] = fadd <vscale x 4 x float> [[TMP15]], [[VEC_PHI]]
+; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP15]])
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP14]])
+; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP17]])
+; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP16]])
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-UNORDERED:       scalar.ph:
 ; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-UNORDERED:       for.body:
 ; CHECK-UNORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -636,8 +640,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK-UNORDERED:       for.end:
-; CHECK-UNORDERED-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
 ; CHECK-UNORDERED-NEXT:    store float [[ADD1_LCSSA]], ptr [[A]], align 4
 ; CHECK-UNORDERED-NEXT:    store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4
 ; CHECK-UNORDERED-NEXT:    ret void
@@ -661,24 +665,24 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-ORDERED-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; CHECK-ORDERED-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
+; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
+; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; CHECK-ORDERED-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, ptr [[TMP11]], align 4
 ; CHECK-ORDERED-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
-; CHECK-ORDERED-NEXT:    [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP11]])
-; CHECK-ORDERED-NEXT:    [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], <vscale x 4 x float> [[TMP10]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; CHECK-ORDERED-NEXT:    [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
+; CHECK-ORDERED-NEXT:    [[TMP15]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], <vscale x 4 x float> [[TMP12]])
+; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-ORDERED:       middle.block:
@@ -686,8 +690,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
 ; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -704,8 +708,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    store float [[ADD1_LCSSA]], ptr [[A]], align 4
 ; CHECK-ORDERED-NEXT:    store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4
 ; CHECK-ORDERED-NEXT:    ret void
@@ -730,35 +734,35 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-ORDERED-TF-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = sub i64 [[TMP2]], [[TMP11]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[TMP2]], [[TMP11]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]])
 ; CHECK-ORDERED-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       vector.body:
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP15]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP17]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x float> poison)
 ; CHECK-ORDERED-TF-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_MASKED_VEC]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
-; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP18]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP16]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], <vscale x 4 x float> [[TMP20]])
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP20]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP23]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], <vscale x 4 x float> [[TMP22]])
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
 ; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 4 x i1> [[TMP24]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -766,8 +770,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -784,8 +788,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK-ORDERED-TF:       for.end:
-; CHECK-ORDERED-TF-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    store float [[ADD1_LCSSA]], ptr [[A]], align 4
 ; CHECK-ORDERED-TF-NEXT:    store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4
 ; CHECK-ORDERED-TF-NEXT:    ret void
@@ -867,26 +871,26 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
 ; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT:    [[TMP11]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[TMP10]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
+; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-UNORDERED-NEXT:    [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[TMP12]]
+; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP11]])
+; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP13]])
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-UNORDERED:       scalar.ph:
@@ -929,22 +933,22 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
+; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
 ; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-ORDERED-NEXT:    [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP10]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
+; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-ORDERED-NEXT:    [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP12]])
+; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-ORDERED:       middle.block:
@@ -952,7 +956,7 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
 ; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -967,7 +971,7 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK-ORDERED:       for.end.loopexit:
-; CHECK-ORDERED-NEXT:    [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_END]]
 ; CHECK-ORDERED:       for.end:
 ; CHECK-ORDERED-NEXT:    [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
@@ -991,31 +995,31 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP5]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = sub i64 [[N]], [[TMP7]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], [[TMP7]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = sub i64 [[N]], [[TMP9]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[N]], [[TMP9]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       vector.body:
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP16]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP17]])
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]])
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP17]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP20]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP19]])
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
 ; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[TMP21]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -1023,7 +1027,7 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1038,7 +1042,7 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-TF-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK-ORDERED-TF:       for.end.loopexit:
-; CHECK-ORDERED-TF-NEXT:    [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_END]]
 ; CHECK-ORDERED-TF:       for.end:
 ; CHECK-ORDERED-TF-NEXT:    [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
@@ -1110,28 +1114,28 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0
-; CHECK-UNORDERED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> poison)
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = xor <vscale x 4 x i1> [[TMP7]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-UNORDERED-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD]]
-; CHECK-UNORDERED-NEXT:    [[TMP11]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
+; CHECK-UNORDERED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
+; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP9]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-UNORDERED-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD]]
+; CHECK-UNORDERED-NEXT:    [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP11]])
+; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP13]])
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-UNORDERED:       scalar.ph:
@@ -1171,24 +1175,24 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0
-; CHECK-ORDERED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = xor <vscale x 4 x i1> [[TMP7]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD]]
-; CHECK-ORDERED-NEXT:    [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[PREDPHI]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
+; CHECK-ORDERED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP9]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD]]
+; CHECK-ORDERED-NEXT:    [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[PREDPHI]])
+; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK-ORDERED:       middle.block:
@@ -1196,7 +1200,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
 ; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -1216,7 +1220,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    ret float [[RDX]]
 ;
 ; CHECK-ORDERED-TF-LABEL: define float @fadd_conditional
@@ -1232,36 +1236,36 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       vector.body:
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP10]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
-; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = getelementptr float, ptr [[TMP14]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP18]]
-; CHECK-ORDERED-TF-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP20]])
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP12]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> zeroinitializer
+; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP12]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = getelementptr float, ptr [[TMP17]], i32 0
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = xor <vscale x 4 x i1> [[TMP15]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
+; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = or <vscale x 4 x i1> [[TMP16]], [[TMP20]]
+; CHECK-ORDERED-TF-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP20]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = select <vscale x 4 x i1> [[TMP21]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP23]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP22]])
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP11]])
 ; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 4 x i1> [[TMP24]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1269,7 +1273,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -1289,7 +1293,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-TF-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK-ORDERED-TF:       for.end:
-; CHECK-ORDERED-TF-NEXT:    [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    ret float [[RDX]]
 ;
 
@@ -1358,26 +1362,26 @@ define float @fadd_multiple(ptr noalias nocapture %a, ptr noalias nocapture %b,
 ; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = fadd <vscale x 8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP10]] = fadd <vscale x 8 x float> [[TMP7]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP8]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = fadd <vscale x 8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP11]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP12]] = fadd <vscale x 8 x float> [[TMP9]], [[WIDE_LOAD1]]
+; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP10]])
+; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP12]])
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-UNORDERED:       scalar.ph:
@@ -1498,78 +1502,78 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP53:%.*]] = mul i64 [[TMP52]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]]
-; CHECK-UNORDERED-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]]
-; CHECK-UNORDERED-NEXT:    [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP46:%.*]] = mul i64 [[TMP45]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]]
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, ptr [[TMP47]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP48]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT:    [[TMP49]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT:    [[TMP50]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT:    [[TMP51]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP53]]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 1
+; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 1
+; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]]
+; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]]
+; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]]
+; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0
+; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]]
+; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]]
+; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP33]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]]
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP26]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP29]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP32]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP35]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]]
+; CHECK-UNORDERED-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]]
+; CHECK-UNORDERED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0
+; CHECK-UNORDERED-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]]
+; CHECK-UNORDERED-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]]
+; CHECK-UNORDERED-NEXT:    [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP47]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]]
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP40]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP43]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP46]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, ptr [[TMP49]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP50]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED-NEXT:    [[TMP51]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED-NEXT:    [[TMP52]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED-NEXT:    [[TMP53]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-UNORDERED-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP49]], [[TMP48]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX11:%.*]] = fadd <vscale x 8 x float> [[TMP50]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX12:%.*]] = fadd <vscale x 8 x float> [[TMP51]], [[BIN_RDX11]]
+; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP51]], [[TMP50]]
+; CHECK-UNORDERED-NEXT:    [[BIN_RDX11:%.*]] = fadd <vscale x 8 x float> [[TMP52]], [[BIN_RDX]]
+; CHECK-UNORDERED-NEXT:    [[BIN_RDX12:%.*]] = fadd <vscale x 8 x float> [[TMP53]], [[BIN_RDX11]]
 ; CHECK-UNORDERED-NEXT:    [[TMP55:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX12]])
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1604,73 +1608,73 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP57:%.*]] = mul i64 [[TMP56]], 32
+; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP55:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
-; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 16
-; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 24
-; CHECK-ORDERED-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 8
-; CHECK-ORDERED-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]]
-; CHECK-ORDERED-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 16
-; CHECK-ORDERED-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]]
-; CHECK-ORDERED-NEXT:    [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP46:%.*]] = mul i64 [[TMP45]], 24
-; CHECK-ORDERED-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]]
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP47]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP48:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED-NEXT:    [[TMP49:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED-NEXT:    [[TMP50:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED-NEXT:    [[TMP51:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED-NEXT:    [[TMP52:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP48]])
-; CHECK-ORDERED-NEXT:    [[TMP53:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP52]], <vscale x 8 x float> [[TMP49]])
-; CHECK-ORDERED-NEXT:    [[TMP54:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP53]], <vscale x 8 x float> [[TMP50]])
-; CHECK-ORDERED-NEXT:    [[TMP55]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], <vscale x 8 x float> [[TMP51]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP57]]
+; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], 0
+; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 1
+; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 24
+; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], 0
+; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 1
+; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]]
+; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]]
+; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]]
+; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0
+; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-ORDERED-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]]
+; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 16
+; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]]
+; CHECK-ORDERED-NEXT:    [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP33]], 24
+; CHECK-ORDERED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]]
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP26]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP29]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP32]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP35]], align 4
+; CHECK-ORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; CHECK-ORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]]
+; CHECK-ORDERED-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]]
+; CHECK-ORDERED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0
+; CHECK-ORDERED-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 8
+; CHECK-ORDERED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]]
+; CHECK-ORDERED-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 16
+; CHECK-ORDERED-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]]
+; CHECK-ORDERED-NEXT:    [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP47]], 24
+; CHECK-ORDERED-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]]
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP40]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP43]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP46]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP49]], align 4
+; CHECK-ORDERED-NEXT:    [[TMP50:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED-NEXT:    [[TMP51:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED-NEXT:    [[TMP52:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED-NEXT:    [[TMP53:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED-NEXT:    [[TMP54:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP50]])
+; CHECK-ORDERED-NEXT:    [[TMP55:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], <vscale x 8 x float> [[TMP51]])
+; CHECK-ORDERED-NEXT:    [[TMP56:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP55]], <vscale x 8 x float> [[TMP52]])
+; CHECK-ORDERED-NEXT:    [[TMP57]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP56]], <vscale x 8 x float> [[TMP53]])
+; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-ORDERED-NEXT:    [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK-ORDERED:       middle.block:
@@ -1678,7 +1682,7 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
 ; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -1692,7 +1696,7 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    ret float [[MULADD_LCSSA]]
 ;
 ; CHECK-ORDERED-TF-LABEL: define float @fmuladd_strict
@@ -1708,114 +1712,116 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP87:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP88:%.*]] = mul i64 [[TMP87]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = sub i64 [[N]], [[TMP13]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[N]], [[TMP13]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = sub i64 [[N]], [[TMP18]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = icmp ugt i64 [[N]], [[TMP18]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[TMP19]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = sub i64 [[N]], [[TMP23]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = icmp ugt i64 [[N]], [[TMP23]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 16
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP28]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 24
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 16
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP30]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 24
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP32]]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 0)
 ; CHECK-ORDERED-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       vector.body:
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP86:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 8
-; CHECK-ORDERED-TF-NEXT:    [[TMP34:%.*]] = add i64 [[TMP33]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 1
-; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 16
-; CHECK-ORDERED-TF-NEXT:    [[TMP39:%.*]] = add i64 [[TMP38]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 1
-; CHECK-ORDERED-TF-NEXT:    [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 24
-; CHECK-ORDERED-TF-NEXT:    [[TMP44:%.*]] = add i64 [[TMP43]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 1
-; CHECK-ORDERED-TF-NEXT:    [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP53:%.*]] = mul i64 [[TMP52]], 8
-; CHECK-ORDERED-TF-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP56:%.*]] = mul i64 [[TMP55]], 16
-; CHECK-ORDERED-TF-NEXT:    [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP58]], 24
-; CHECK-ORDERED-TF-NEXT:    [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]]
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP66:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP67:%.*]] = mul i64 [[TMP66]], 8
-; CHECK-ORDERED-TF-NEXT:    [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP70:%.*]] = mul i64 [[TMP69]], 16
-; CHECK-ORDERED-TF-NEXT:    [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP73:%.*]] = mul i64 [[TMP72]], 24
-; CHECK-ORDERED-TF-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]]
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[TMP75:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP76:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP77:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY6]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK10:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP88:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[TMP33:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP37:%.*]] = mul i64 [[TMP36]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP38:%.*]] = add i64 [[INDEX]], [[TMP37]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP41:%.*]] = add i64 [[TMP40]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP43:%.*]] = add i64 [[INDEX]], [[TMP42]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP46:%.*]] = add i64 [[TMP45]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP47:%.*]] = mul i64 [[TMP46]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP48:%.*]] = add i64 [[INDEX]], [[TMP47]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP33]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP38]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP43]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP52:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP48]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i32 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP54:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP54]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP55]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP57]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP58]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP61:%.*]] = mul i64 [[TMP60]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP61]]
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP53]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP56]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP59]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP62]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP33]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP38]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP65:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP43]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP66:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP48]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP67:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i32 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP68:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP69:%.*]] = mul i64 [[TMP68]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP69]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP71:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP72:%.*]] = mul i64 [[TMP71]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP73:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP72]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP74]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP75]]
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP67]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP70]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP73]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD17:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP76]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[TMP77:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD14]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP78:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP79:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[TMP75]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP80:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP79]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP81:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[TMP76]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP82:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP80]], <vscale x 8 x float> [[TMP81]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP83:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> [[TMP77]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP79:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD16]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP80:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD13]], [[WIDE_MASKED_LOAD17]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP81:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[TMP77]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP82:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP81]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP83:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP78]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP84:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], <vscale x 8 x float> [[TMP83]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP85:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP78]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP86]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], <vscale x 8 x float> [[TMP85]])
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP85:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 8 x float> [[TMP79]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP86:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], <vscale x 8 x float> [[TMP85]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP87:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 8 x float> [[TMP80]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP88]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP86]], <vscale x 8 x float> [[TMP87]])
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP89:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP90:%.*]] = mul i64 [[TMP89]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]]
@@ -1825,21 +1831,23 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP95:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP96:%.*]] = mul i64 [[TMP95]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP14]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP19]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP24]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP98:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP99:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP100:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT17]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP101:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT18]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP94]], i64 [[TMP11]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT18:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT19]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT20]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT21]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT18]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT22]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT18]], i64 0)
+; CHECK-ORDERED-TF-NEXT:    [[TMP98:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT22]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP99:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT21]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP100:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT20]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP101:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT19]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP102:%.*]] = extractelement <vscale x 8 x i1> [[TMP98]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK-ORDERED-TF:       middle.block:
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP88]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -1853,7 +1861,7 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK-ORDERED-TF:       for.end:
-; CHECK-ORDERED-TF-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP88]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    ret float [[MULADD_LCSSA]]
 ;
 
@@ -1912,78 +1920,78 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP53:%.*]] = mul i64 [[TMP52]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0
-; CHECK-UNORDERED-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 8
-; CHECK-UNORDERED-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]]
-; CHECK-UNORDERED-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 16
-; CHECK-UNORDERED-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]]
-; CHECK-UNORDERED-NEXT:    [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP46:%.*]] = mul i64 [[TMP45]], 24
-; CHECK-UNORDERED-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]]
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
-; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, ptr [[TMP47]], align 4
-; CHECK-UNORDERED-NEXT:    [[TMP48]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT:    [[TMP49]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT:    [[TMP50]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT:    [[TMP51]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP53]]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 1
+; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], 0
+; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 1
+; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]]
+; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]]
+; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]]
+; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0
+; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]]
+; CHECK-UNORDERED-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]]
+; CHECK-UNORDERED-NEXT:    [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP33]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]]
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP26]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP29]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP32]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP35]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]]
+; CHECK-UNORDERED-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]]
+; CHECK-UNORDERED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0
+; CHECK-UNORDERED-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]]
+; CHECK-UNORDERED-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]]
+; CHECK-UNORDERED-NEXT:    [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP47]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]]
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP40]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP43]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP46]], align 4
+; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, ptr [[TMP49]], align 4
+; CHECK-UNORDERED-NEXT:    [[TMP50]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED-NEXT:    [[TMP51]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED-NEXT:    [[TMP52]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED-NEXT:    [[TMP53]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-UNORDERED-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK-UNORDERED:       middle.block:
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[TMP49]], [[TMP48]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX11:%.*]] = fadd nnan <vscale x 8 x float> [[TMP50]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT:    [[BIN_RDX12:%.*]] = fadd nnan <vscale x 8 x float> [[TMP51]], [[BIN_RDX11]]
+; CHECK-UNORDERED-NEXT:    [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[TMP51]], [[TMP50]]
+; CHECK-UNORDERED-NEXT:    [[BIN_RDX11:%.*]] = fadd nnan <vscale x 8 x float> [[TMP52]], [[BIN_RDX]]
+; CHECK-UNORDERED-NEXT:    [[BIN_RDX12:%.*]] = fadd nnan <vscale x 8 x float> [[TMP53]], [[BIN_RDX11]]
 ; CHECK-UNORDERED-NEXT:    [[TMP55:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX12]])
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -2018,73 +2026,73 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP57:%.*]] = mul i64 [[TMP56]], 32
+; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP55:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
-; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 16
-; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 24
-; CHECK-ORDERED-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0
-; CHECK-ORDERED-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 8
-; CHECK-ORDERED-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]]
-; CHECK-ORDERED-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 16
-; CHECK-ORDERED-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]]
-; CHECK-ORDERED-NEXT:    [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP46:%.*]] = mul i64 [[TMP45]], 24
-; CHECK-ORDERED-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]]
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
-; CHECK-ORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP47]], align 4
-; CHECK-ORDERED-NEXT:    [[TMP48:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED-NEXT:    [[TMP49:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED-NEXT:    [[TMP50:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED-NEXT:    [[TMP51:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED-NEXT:    [[TMP52:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP48]])
-; CHECK-ORDERED-NEXT:    [[TMP53:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP52]], <vscale x 8 x float> [[TMP49]])
-; CHECK-ORDERED-NEXT:    [[TMP54:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP53]], <vscale x 8 x float> [[TMP50]])
-; CHECK-ORDERED-NEXT:    [[TMP55]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], <vscale x 8 x float> [[TMP51]])
-; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP57]]
+; CHECK-ORDERED-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], 0
+; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 1
+; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 24
+; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], 0
+; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 1
+; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]]
+; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]]
+; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]]
+; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0
+; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-ORDERED-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]]
+; CHECK-ORDERED-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 16
+; CHECK-ORDERED-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]]
+; CHECK-ORDERED-NEXT:    [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP33]], 24
+; CHECK-ORDERED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]]
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP26]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP29]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP32]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP35]], align 4
+; CHECK-ORDERED-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; CHECK-ORDERED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]]
+; CHECK-ORDERED-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]]
+; CHECK-ORDERED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0
+; CHECK-ORDERED-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 8
+; CHECK-ORDERED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]]
+; CHECK-ORDERED-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 16
+; CHECK-ORDERED-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]]
+; CHECK-ORDERED-NEXT:    [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP47]], 24
+; CHECK-ORDERED-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]]
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP40]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP43]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP46]], align 4
+; CHECK-ORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP49]], align 4
+; CHECK-ORDERED-NEXT:    [[TMP50:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED-NEXT:    [[TMP51:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED-NEXT:    [[TMP52:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED-NEXT:    [[TMP53:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED-NEXT:    [[TMP54:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP50]])
+; CHECK-ORDERED-NEXT:    [[TMP55:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], <vscale x 8 x float> [[TMP51]])
+; CHECK-ORDERED-NEXT:    [[TMP56:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP55]], <vscale x 8 x float> [[TMP52]])
+; CHECK-ORDERED-NEXT:    [[TMP57]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP56]], <vscale x 8 x float> [[TMP53]])
+; CHECK-ORDERED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-ORDERED-NEXT:    [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK-ORDERED:       middle.block:
@@ -2092,7 +2100,7 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
 ; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -2106,7 +2114,7 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK-ORDERED:       for.end:
-; CHECK-ORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-NEXT:    ret float [[MULADD_LCSSA]]
 ;
 ; CHECK-ORDERED-TF-LABEL: define float @fmuladd_strict_fmf
@@ -2122,114 +2130,116 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP87:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP88:%.*]] = mul i64 [[TMP87]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 32
-; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = sub i64 [[N]], [[TMP13]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[N]], [[TMP13]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = sub i64 [[N]], [[TMP18]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = icmp ugt i64 [[N]], [[TMP18]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[TMP19]], i64 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = sub i64 [[N]], [[TMP23]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = icmp ugt i64 [[N]], [[TMP23]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 16
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP28]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 24
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 16
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP30]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 24
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP32]]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 0)
 ; CHECK-ORDERED-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       vector.body:
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP86:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 8
-; CHECK-ORDERED-TF-NEXT:    [[TMP34:%.*]] = add i64 [[TMP33]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 1
-; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 16
-; CHECK-ORDERED-TF-NEXT:    [[TMP39:%.*]] = add i64 [[TMP38]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 1
-; CHECK-ORDERED-TF-NEXT:    [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 24
-; CHECK-ORDERED-TF-NEXT:    [[TMP44:%.*]] = add i64 [[TMP43]], 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 1
-; CHECK-ORDERED-TF-NEXT:    [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP53:%.*]] = mul i64 [[TMP52]], 8
-; CHECK-ORDERED-TF-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP56:%.*]] = mul i64 [[TMP55]], 16
-; CHECK-ORDERED-TF-NEXT:    [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP58]], 24
-; CHECK-ORDERED-TF-NEXT:    [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]]
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[TMP66:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP67:%.*]] = mul i64 [[TMP66]], 8
-; CHECK-ORDERED-TF-NEXT:    [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP70:%.*]] = mul i64 [[TMP69]], 16
-; CHECK-ORDERED-TF-NEXT:    [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP73:%.*]] = mul i64 [[TMP72]], 24
-; CHECK-ORDERED-TF-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]]
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[TMP75:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP76:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP77:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY6]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK10:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP88:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[TMP33:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP37:%.*]] = mul i64 [[TMP36]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP38:%.*]] = add i64 [[INDEX]], [[TMP37]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP41:%.*]] = add i64 [[TMP40]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP43:%.*]] = add i64 [[INDEX]], [[TMP42]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP46:%.*]] = add i64 [[TMP45]], 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP47:%.*]] = mul i64 [[TMP46]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP48:%.*]] = add i64 [[INDEX]], [[TMP47]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP33]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP38]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP43]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP52:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP48]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i32 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP54:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP54]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP55]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP57]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP58]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP61:%.*]] = mul i64 [[TMP60]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP49]], i64 [[TMP61]]
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP53]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP56]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP59]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP62]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP33]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP38]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP65:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP43]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP66:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP48]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP67:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i32 0
+; CHECK-ORDERED-TF-NEXT:    [[TMP68:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP69:%.*]] = mul i64 [[TMP68]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP69]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP71:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP72:%.*]] = mul i64 [[TMP71]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP73:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP72]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP74]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP63]], i64 [[TMP75]]
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP67]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP70]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP73]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD17:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP76]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT:    [[TMP77:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD14]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP78:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP79:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[TMP75]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP80:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP79]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP81:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[TMP76]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP82:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP80]], <vscale x 8 x float> [[TMP81]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP83:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> [[TMP77]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP79:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD16]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP80:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD13]], [[WIDE_MASKED_LOAD17]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP81:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[TMP77]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP82:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP81]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP83:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP78]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP84:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], <vscale x 8 x float> [[TMP83]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP85:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP78]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP86]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], <vscale x 8 x float> [[TMP85]])
-; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]]
+; CHECK-ORDERED-TF-NEXT:    [[TMP85:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 8 x float> [[TMP79]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP86:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], <vscale x 8 x float> [[TMP85]])
+; CHECK-ORDERED-TF-NEXT:    [[TMP87:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 8 x float> [[TMP80]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP88]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP86]], <vscale x 8 x float> [[TMP87]])
+; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP89:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP90:%.*]] = mul i64 [[TMP89]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]]
@@ -2239,21 +2249,23 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP95:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP96:%.*]] = mul i64 [[TMP95]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]]
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP14]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP19]])
-; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP24]])
-; CHECK-ORDERED-TF-NEXT:    [[TMP98:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP99:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP100:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT17]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT:    [[TMP101:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT18]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP94]], i64 [[TMP11]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT18:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT19]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT20]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT21]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT18]], i64 8)
+; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT22]] = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT18]], i64 0)
+; CHECK-ORDERED-TF-NEXT:    [[TMP98:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT22]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP99:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT21]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP100:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT20]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP101:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT19]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP102:%.*]] = extractelement <vscale x 8 x i1> [[TMP98]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK-ORDERED-TF:       middle.block:
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP88]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -2267,7 +2279,7 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-ORDERED-TF-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK-ORDERED-TF:       for.end:
-; CHECK-ORDERED-TF-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP88]], [[MIDDLE_BLOCK]] ]
 ; CHECK-ORDERED-TF-NEXT:    ret float [[MULADD_LCSSA]]
 ;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index 1a6e83a61ce74e6..1b7626fd617c57f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -18,102 +18,106 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP61:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP61]], 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[UMAX]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
-; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 16
-; CHECK-NEXT:    [[TMP17:%.*]] = sub i64 [[UMAX]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[TMP22:%.*]] = sub i64 [[UMAX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp ugt i64 [[UMAX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0
-; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
-; CHECK-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[UMAX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[UMAX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i64 [[UMAX]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[UMAX]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
+; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 16
+; CHECK-NEXT:    [[TMP19:%.*]] = sub i64 [[UMAX]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ugt i64 [[UMAX]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[TMP19]], i64 0
+; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
+; CHECK-NEXT:    [[TMP24:%.*]] = sub i64 [[UMAX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ugt i64 [[UMAX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 0
 ; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
-; CHECK-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
+; CHECK-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 4
+; CHECK-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP28]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 12
-; CHECK-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]])
+; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 8
+; CHECK-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP30]]
+; CHECK-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 12
+; CHECK-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP32]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[UMAX]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 4)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 4)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 0)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX6]], 0
-; CHECK-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 4
-; CHECK-NEXT:    [[TMP34:%.*]] = add i64 [[TMP33]], 0
-; CHECK-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 1
-; CHECK-NEXT:    [[TMP36:%.*]] = add i64 [[INDEX6]], [[TMP35]]
-; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 8
-; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[TMP38]], 0
-; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 1
-; CHECK-NEXT:    [[TMP41:%.*]] = add i64 [[INDEX6]], [[TMP40]]
-; CHECK-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 12
-; CHECK-NEXT:    [[TMP44:%.*]] = add i64 [[TMP43]], 0
-; CHECK-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 1
-; CHECK-NEXT:    [[TMP46:%.*]] = add i64 [[INDEX6]], [[TMP45]]
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP31]]
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP36]]
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP41]]
-; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP46]]
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0
-; CHECK-NEXT:    [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP53:%.*]] = mul i64 [[TMP52]], 4
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]]
-; CHECK-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP56:%.*]] = mul i64 [[TMP55]], 8
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]]
-; CHECK-NEXT:    [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP58]], 12
-; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]])
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]])
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]])
-; CHECK-NEXT:    [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]]
+; CHECK-NEXT:    [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY6]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK10:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK11:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP33:%.*]] = add i64 [[INDEX8]], 0
+; CHECK-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 4
+; CHECK-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], 0
+; CHECK-NEXT:    [[TMP37:%.*]] = mul i64 [[TMP36]], 1
+; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[INDEX8]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 8
+; CHECK-NEXT:    [[TMP41:%.*]] = add i64 [[TMP40]], 0
+; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 1
+; CHECK-NEXT:    [[TMP43:%.*]] = add i64 [[INDEX8]], [[TMP42]]
+; CHECK-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 12
+; CHECK-NEXT:    [[TMP46:%.*]] = add i64 [[TMP45]], 0
+; CHECK-NEXT:    [[TMP47:%.*]] = mul i64 [[TMP46]], 1
+; CHECK-NEXT:    [[TMP48:%.*]] = add i64 [[INDEX8]], [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP33]]
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP38]]
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP43]]
+; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP48]]
+; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr i32, ptr [[TMP49]], i32 0
+; CHECK-NEXT:    [[TMP54:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP54]], 4
+; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP55]]
+; CHECK-NEXT:    [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP57]], 8
+; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP58]]
+; CHECK-NEXT:    [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP61:%.*]] = mul i64 [[TMP60]], 12
+; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP61]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP53]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP56]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP59]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK10]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP62]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK11]])
+; CHECK-NEXT:    [[INDEX_NEXT12]] = add i64 [[INDEX8]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP64:%.*]] = mul i64 [[TMP63]], 4
-; CHECK-NEXT:    [[TMP65:%.*]] = add i64 [[INDEX6]], [[TMP64]]
+; CHECK-NEXT:    [[TMP65:%.*]] = add i64 [[INDEX8]], [[TMP64]]
 ; CHECK-NEXT:    [[TMP66:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP67:%.*]] = mul i64 [[TMP66]], 8
-; CHECK-NEXT:    [[TMP68:%.*]] = add i64 [[INDEX6]], [[TMP67]]
+; CHECK-NEXT:    [[TMP68:%.*]] = add i64 [[INDEX8]], [[TMP67]]
 ; CHECK-NEXT:    [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP70:%.*]] = mul i64 [[TMP69]], 12
-; CHECK-NEXT:    [[TMP71:%.*]] = add i64 [[INDEX6]], [[TMP70]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT11]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP65]], i64 [[TMP14]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP68]], i64 [[TMP19]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP71]], i64 [[TMP24]])
-; CHECK-NEXT:    [[TMP72:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP73:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT11]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP74:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT12]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP75:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP71:%.*]] = add i64 [[INDEX8]], [[TMP70]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP68]], i64 [[TMP11]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT13:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX8]], i64 [[TMP11]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 4)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT15]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT13]], i64 4)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT13]], i64 0)
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT17]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT15]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT14]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <vscale x 4 x i1> [[TMP72]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
@@ -160,128 +164,132 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP83:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP84:%.*]] = mul i64 [[TMP83]], 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[UMAX]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
-; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 16
-; CHECK-NEXT:    [[TMP17:%.*]] = sub i64 [[UMAX]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT:    [[TMP22:%.*]] = sub i64 [[UMAX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp ugt i64 [[UMAX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0
-; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
-; CHECK-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[UMAX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[UMAX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i64 [[UMAX]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[UMAX]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
+; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 16
+; CHECK-NEXT:    [[TMP19:%.*]] = sub i64 [[UMAX]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ugt i64 [[UMAX]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[TMP19]], i64 0
+; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
+; CHECK-NEXT:    [[TMP24:%.*]] = sub i64 [[UMAX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ugt i64 [[UMAX]], [[TMP23]]
+; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 0
 ; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
-; CHECK-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
+; CHECK-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 4
+; CHECK-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP28]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 12
-; CHECK-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]])
+; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 8
+; CHECK-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP30]]
+; CHECK-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 12
+; CHECK-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP32]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[UMAX]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 4)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 4)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 0)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX6]], 0
-; CHECK-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 4
-; CHECK-NEXT:    [[TMP34:%.*]] = add i64 [[TMP33]], 0
-; CHECK-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 1
-; CHECK-NEXT:    [[TMP36:%.*]] = add i64 [[INDEX6]], [[TMP35]]
-; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 8
-; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[TMP38]], 0
-; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 1
-; CHECK-NEXT:    [[TMP41:%.*]] = add i64 [[INDEX6]], [[TMP40]]
-; CHECK-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 12
-; CHECK-NEXT:    [[TMP44:%.*]] = add i64 [[TMP43]], 0
-; CHECK-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 1
-; CHECK-NEXT:    [[TMP46:%.*]] = add i64 [[INDEX6]], [[TMP45]]
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[TMP31]]
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP36]]
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP41]]
-; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP46]]
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0
-; CHECK-NEXT:    [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP53:%.*]] = mul i64 [[TMP52]], 4
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]]
-; CHECK-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP56:%.*]] = mul i64 [[TMP55]], 8
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]]
-; CHECK-NEXT:    [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP58]], 12
-; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP62:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD10]], zeroinitializer
-; CHECK-NEXT:    [[TMP63:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD11]], zeroinitializer
+; CHECK-NEXT:    [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY6]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK10:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK11:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP33:%.*]] = add i64 [[INDEX8]], 0
+; CHECK-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 4
+; CHECK-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], 0
+; CHECK-NEXT:    [[TMP37:%.*]] = mul i64 [[TMP36]], 1
+; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[INDEX8]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 8
+; CHECK-NEXT:    [[TMP41:%.*]] = add i64 [[TMP40]], 0
+; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 1
+; CHECK-NEXT:    [[TMP43:%.*]] = add i64 [[INDEX8]], [[TMP42]]
+; CHECK-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 12
+; CHECK-NEXT:    [[TMP46:%.*]] = add i64 [[TMP45]], 0
+; CHECK-NEXT:    [[TMP47:%.*]] = mul i64 [[TMP46]], 1
+; CHECK-NEXT:    [[TMP48:%.*]] = add i64 [[INDEX8]], [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[TMP33]]
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP38]]
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP43]]
+; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP48]]
+; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr i32, ptr [[TMP49]], i32 0
+; CHECK-NEXT:    [[TMP54:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP54]], 4
+; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP55]]
+; CHECK-NEXT:    [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP57]], 8
+; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP58]]
+; CHECK-NEXT:    [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP61:%.*]] = mul i64 [[TMP60]], 12
+; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP61]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP53]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP56]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP59]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP62]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK11]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP63:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD12]], zeroinitializer
-; CHECK-NEXT:    [[TMP69:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP61]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP70:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i1> [[TMP62]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP71:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i1> [[TMP63]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP72:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i1> [[TMP64]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP31]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP36]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP41]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP46]]
-; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
-; CHECK-NEXT:    [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP74]], 4
-; CHECK-NEXT:    [[TMP76:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP75]]
-; CHECK-NEXT:    [[TMP77:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP77]], 8
-; CHECK-NEXT:    [[TMP79:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP78]]
-; CHECK-NEXT:    [[TMP80:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP81:%.*]] = mul i64 [[TMP80]], 12
-; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, <vscale x 4 x i1> [[TMP69]])
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, <vscale x 4 x i1> [[TMP70]])
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, <vscale x 4 x i1> [[TMP71]])
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, <vscale x 4 x i1> [[TMP72]])
-; CHECK-NEXT:    [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP84]]
+; CHECK-NEXT:    [[TMP65:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; CHECK-NEXT:    [[TMP66:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; CHECK-NEXT:    [[TMP67:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP63]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP68:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i1> [[TMP64]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP69:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 4 x i1> [[TMP65]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP70:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK11]], <vscale x 4 x i1> [[TMP66]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP33]]
+; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP38]]
+; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP43]]
+; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP48]]
+; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr i32, ptr [[TMP71]], i32 0
+; CHECK-NEXT:    [[TMP76:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP77:%.*]] = mul i64 [[TMP76]], 4
+; CHECK-NEXT:    [[TMP78:%.*]] = getelementptr i32, ptr [[TMP71]], i64 [[TMP77]]
+; CHECK-NEXT:    [[TMP79:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP80:%.*]] = mul i64 [[TMP79]], 8
+; CHECK-NEXT:    [[TMP81:%.*]] = getelementptr i32, ptr [[TMP71]], i64 [[TMP80]]
+; CHECK-NEXT:    [[TMP82:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP83:%.*]] = mul i64 [[TMP82]], 12
+; CHECK-NEXT:    [[TMP84:%.*]] = getelementptr i32, ptr [[TMP71]], i64 [[TMP83]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP75]], i32 4, <vscale x 4 x i1> [[TMP67]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP78]], i32 4, <vscale x 4 x i1> [[TMP68]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP81]], i32 4, <vscale x 4 x i1> [[TMP69]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP84]], i32 4, <vscale x 4 x i1> [[TMP70]])
+; CHECK-NEXT:    [[INDEX_NEXT15]] = add i64 [[INDEX8]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP85:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP86:%.*]] = mul i64 [[TMP85]], 4
-; CHECK-NEXT:    [[TMP87:%.*]] = add i64 [[INDEX6]], [[TMP86]]
+; CHECK-NEXT:    [[TMP87:%.*]] = add i64 [[INDEX8]], [[TMP86]]
 ; CHECK-NEXT:    [[TMP88:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP89:%.*]] = mul i64 [[TMP88]], 8
-; CHECK-NEXT:    [[TMP90:%.*]] = add i64 [[INDEX6]], [[TMP89]]
+; CHECK-NEXT:    [[TMP90:%.*]] = add i64 [[INDEX8]], [[TMP89]]
 ; CHECK-NEXT:    [[TMP91:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP92:%.*]] = mul i64 [[TMP91]], 12
-; CHECK-NEXT:    [[TMP93:%.*]] = add i64 [[INDEX6]], [[TMP92]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP87]], i64 [[TMP14]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT15]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP90]], i64 [[TMP19]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP24]])
-; CHECK-NEXT:    [[TMP94:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP95:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT14]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP96:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT15]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP97:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP93:%.*]] = add i64 [[INDEX8]], [[TMP92]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP90]], i64 [[TMP11]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT16:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX8]], i64 [[TMP11]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 4)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT19]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT16]], i64 4)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT20]] = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT16]], i64 0)
+; CHECK-NEXT:    [[TMP94:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT20]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP95:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT19]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP96:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT18]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP97:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT17]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <vscale x 4 x i1> [[TMP94]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
new file mode 100644
index 000000000000000..8b23489d6e42ee9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
@@ -0,0 +1,655 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes="default<O3>" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=0                < %s | FileCheck %s -check-prefix CHECK-SVE-UF0
+; RUN: opt -S --passes="default<O3>" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=2                < %s | FileCheck %s -check-prefix CHECK-SVE-UF2
+; RUN: opt -S --passes="default<O3>" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=4                < %s | FileCheck %s -check-prefix CHECK-SVE-UF4
+; RUN: opt -S --passes="default<O3>" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=0 -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1-UF0
+; RUN: opt -S --passes="default<O3>" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=2 -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1-UF2
+; RUN: opt -S --passes="default<O3>" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=4 -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1-UF4
+
+target triple = "aarch64-unknown-linux"
+
+define void @f0(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
+; CHECK-SVE-UF0-LABEL: define void @f0(
+; CHECK-SVE-UF0-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-UF0-NEXT:  entry:
+; CHECK-SVE-UF0-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF0-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF0:       for.body.preheader:
+; CHECK-SVE-UF0-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF0-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF0-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF0-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF0:       vector.body:
+; CHECK-SVE-UF0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF0-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF0-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF0-NEXT:    [[TMP3:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE-UF0-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF0-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP3]], ptr [[TMP4]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF0-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF0-NEXT:    [[TMP5:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-SVE-UF0-NEXT:    br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE-UF0:       for.cond.cleanup:
+; CHECK-SVE-UF0-NEXT:    ret void
+;
+; CHECK-SVE-UF2-LABEL: define void @f0(
+; CHECK-SVE-UF2-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-UF2-NEXT:  entry:
+; CHECK-SVE-UF2-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF2-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF2:       for.body.preheader:
+; CHECK-SVE-UF2-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF2-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 5
+; CHECK-SVE-UF2-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF2-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; CHECK-SVE-UF2-NEXT:    [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 4
+; CHECK-SVE-UF2-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 4
+; CHECK-SVE-UF2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF2:       vector.body:
+; CHECK-SVE-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP5]]
+; CHECK-SVE-UF2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF2-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP11]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF2-NEXT:    [[TMP12:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE-UF2-NEXT:    [[TMP13:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD3]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE-UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 [[TMP7]]
+; CHECK-SVE-UF2-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP12]], ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF2-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr [[TMP15]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-SVE-UF2-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF2-NEXT:    [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF2-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX_NEXT]], [[TMP9]]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP16]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT5]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF2-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT5]], i64 0
+; CHECK-SVE-UF2-NEXT:    br i1 [[TMP17]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE-UF2:       for.cond.cleanup:
+; CHECK-SVE-UF2-NEXT:    ret void
+;
+; CHECK-SVE-UF4-LABEL: define void @f0(
+; CHECK-SVE-UF4-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-UF4-NEXT:  entry:
+; CHECK-SVE-UF4-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF4-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF4:       for.body.preheader:
+; CHECK-SVE-UF4-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF4-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 6
+; CHECK-SVE-UF4-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4
+; CHECK-SVE-UF4-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 5
+; CHECK-SVE-UF4-NEXT:    [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP7:%.*]] = mul nuw nsw i64 [[TMP6]], 48
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP7]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 4
+; CHECK-SVE-UF4-NEXT:    [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 5
+; CHECK-SVE-UF4-NEXT:    [[TMP12:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP13:%.*]] = mul nuw nsw i64 [[TMP12]], 48
+; CHECK-SVE-UF4-NEXT:    [[TMP14:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 4
+; CHECK-SVE-UF4-NEXT:    [[TMP16:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 5
+; CHECK-SVE-UF4-NEXT:    [[TMP18:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP19:%.*]] = mul nuw nsw i64 [[TMP18]], 48
+; CHECK-SVE-UF4-NEXT:    [[TMP20:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP21:%.*]] = shl nuw nsw i64 [[TMP20]], 4
+; CHECK-SVE-UF4-NEXT:    [[TMP22:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP23:%.*]] = shl nuw nsw i64 [[TMP22]], 5
+; CHECK-SVE-UF4-NEXT:    [[TMP24:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP25:%.*]] = mul nuw nsw i64 [[TMP24]], 48
+; CHECK-SVE-UF4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF4:       vector.body:
+; CHECK-SVE-UF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP9]]
+; CHECK-SVE-UF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP11]]
+; CHECK-SVE-UF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP13]]
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP28]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP29]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF4-NEXT:    [[TMP30:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE-UF4-NEXT:    [[TMP31:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD9]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE-UF4-NEXT:    [[TMP32:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD10]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE-UF4-NEXT:    [[TMP33:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD11]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE-UF4-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP15]]
+; CHECK-SVE-UF4-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP17]]
+; CHECK-SVE-UF4-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP19]]
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP30]], ptr [[TMP34]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP31]], ptr [[TMP35]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK6]])
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP32]], ptr [[TMP36]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK7]])
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP33]], ptr [[TMP37]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK8]])
+; CHECK-SVE-UF4-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF4-NEXT:    [[INDEX_NEXT14]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF4-NEXT:    [[TMP38:%.*]] = add i64 [[INDEX_NEXT]], [[TMP21]]
+; CHECK-SVE-UF4-NEXT:    [[TMP39:%.*]] = add i64 [[INDEX_NEXT]], [[TMP23]]
+; CHECK-SVE-UF4-NEXT:    [[TMP40:%.*]] = add i64 [[INDEX_NEXT]], [[TMP25]]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP40]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT15]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP39]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT16]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP38]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT17]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[TMP41:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT17]], i64 0
+; CHECK-SVE-UF4-NEXT:    br i1 [[TMP41]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE-UF4:       for.cond.cleanup:
+; CHECK-SVE-UF4-NEXT:    ret void
+;
+; CHECK-SVE2p1-UF0-LABEL: define void @f0(
+; CHECK-SVE2p1-UF0-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SVE2p1-UF0-NEXT:  entry:
+; CHECK-SVE2p1-UF0-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE2p1-UF0-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE2p1-UF0:       for.body.preheader:
+; CHECK-SVE2p1-UF0-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-SVE2p1-UF0-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF0-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE2p1-UF0:       vector.body:
+; CHECK-SVE2p1-UF0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF0-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF0-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP3:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF0-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP3]], ptr [[TMP4]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE2p1-UF0-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE2p1-UF0-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP5:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-SVE2p1-UF0-NEXT:    br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE2p1-UF0:       for.cond.cleanup:
+; CHECK-SVE2p1-UF0-NEXT:    ret void
+;
+; CHECK-SVE2p1-UF2-LABEL: define void @f0(
+; CHECK-SVE2p1-UF2-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SVE2p1-UF2-NEXT:  entry:
+; CHECK-SVE2p1-UF2-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE2p1-UF2-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE2p1-UF2:       for.body.preheader:
+; CHECK-SVE2p1-UF2-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 5
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; CHECK-SVE2p1-UF2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE2p1-UF2:       vector.body:
+; CHECK-SVE2p1-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY2]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK3:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP3]]
+; CHECK-SVE2p1-UF2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP6]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-SVE2p1-UF2-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP7]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK3]], <vscale x 16 x i8> poison)
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP8:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD4]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP5]]
+; CHECK-SVE2p1-UF2-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP8]], ptr [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE2p1-UF2-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP9]], ptr [[TMP11]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK3]])
+; CHECK-SVE2p1-UF2-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE2p1-UF2-NEXT:    [[INDEX_NEXT5]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT6]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT7]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT7]], i64 0
+; CHECK-SVE2p1-UF2-NEXT:    br i1 [[TMP12]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE2p1-UF2:       for.cond.cleanup:
+; CHECK-SVE2p1-UF2-NEXT:    ret void
+;
+; CHECK-SVE2p1-UF4-LABEL: define void @f0(
+; CHECK-SVE2p1-UF4-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SVE2p1-UF4-NEXT:  entry:
+; CHECK-SVE2p1-UF4-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE2p1-UF4-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE2p1-UF4:       for.body.preheader:
+; CHECK-SVE2p1-UF4-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 6
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 5
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 16)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 0)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 5
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP9:%.*]] = mul nuw nsw i64 [[TMP8]], 48
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 4
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP12:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 5
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP14:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP15:%.*]] = mul nuw nsw i64 [[TMP14]], 48
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP16:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 5
+; CHECK-SVE2p1-UF4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE2p1-UF4:       vector.body:
+; CHECK-SVE2p1-UF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY7]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY6]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK10:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i64 [[TMP5]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i64 [[TMP7]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i64 [[TMP9]]
+; CHECK-SVE2p1-UF4-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-SVE2p1-UF4-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP19]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 16 x i8> poison)
+; CHECK-SVE2p1-UF4-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP20]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 16 x i8> poison)
+; CHECK-SVE2p1-UF4-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP21]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 16 x i8> poison)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP22:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP23:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD11]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP24:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD12]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP25:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD13]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP11]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP13]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP15]]
+; CHECK-SVE2p1-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP22]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE2p1-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK8]])
+; CHECK-SVE2p1-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP24]], ptr [[TMP28]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK9]])
+; CHECK-SVE2p1-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP25]], ptr [[TMP29]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK10]])
+; CHECK-SVE2p1-UF4-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE2p1-UF4-NEXT:    [[INDEX_NEXT16]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX_NEXT]], [[TMP17]]
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[TMP30]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT17:%.*]] = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT18]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT19]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT20]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT17]], i64 16)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT21]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT17]], i64 0)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT21]], i64 0
+; CHECK-SVE2p1-UF4-NEXT:    br i1 [[TMP31]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE2p1-UF4:       for.cond.cleanup:
+; CHECK-SVE2p1-UF4-NEXT:    ret void
+;
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %mul = mul i8 %0, 3
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %mul, ptr %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @f1(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
+; CHECK-SVE-UF0-LABEL: define void @f1(
+; CHECK-SVE-UF0-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE-UF0-NEXT:  entry:
+; CHECK-SVE-UF0-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF0-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF0:       for.body.preheader:
+; CHECK-SVE-UF0-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF0-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF0-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF0-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF0:       vector.body:
+; CHECK-SVE-UF0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF0-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF0-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP2]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF0-NEXT:    [[TMP3:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE-UF0-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF0-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP3]], ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF0-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF0-NEXT:    [[TMP5:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-SVE-UF0-NEXT:    br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE-UF0:       for.cond.cleanup:
+; CHECK-SVE-UF0-NEXT:    ret void
+;
+; CHECK-SVE-UF2-LABEL: define void @f1(
+; CHECK-SVE-UF2-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE-UF2-NEXT:  entry:
+; CHECK-SVE-UF2-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF2-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF2:       for.body.preheader:
+; CHECK-SVE-UF2-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF2-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE-UF2-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1
+; CHECK-SVE-UF2-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1
+; CHECK-SVE-UF2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF2:       vector.body:
+; CHECK-SVE-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY2]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK3:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP3]]
+; CHECK-SVE-UF2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP6]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF2-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK3]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF2-NEXT:    [[TMP8:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE-UF2-NEXT:    [[TMP9:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD4]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[TMP5]]
+; CHECK-SVE-UF2-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP8]], ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF2-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP9]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK3]])
+; CHECK-SVE-UF2-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF2-NEXT:    [[INDEX_NEXT5]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT6]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT7]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE-UF2-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT7]], i64 0
+; CHECK-SVE-UF2-NEXT:    br i1 [[TMP12]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE-UF2:       for.cond.cleanup:
+; CHECK-SVE-UF2-NEXT:    ret void
+;
+; CHECK-SVE-UF4-LABEL: define void @f1(
+; CHECK-SVE-UF4-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE-UF4-NEXT:  entry:
+; CHECK-SVE-UF4-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF4-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF4:       for.body.preheader:
+; CHECK-SVE-UF4-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF4-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-SVE-UF4-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 2)
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 0)
+; CHECK-SVE-UF4-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1
+; CHECK-SVE-UF4-NEXT:    [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
+; CHECK-SVE-UF4-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP9:%.*]] = mul nuw nsw i64 [[TMP8]], 6
+; CHECK-SVE-UF4-NEXT:    [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 1
+; CHECK-SVE-UF4-NEXT:    [[TMP12:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2
+; CHECK-SVE-UF4-NEXT:    [[TMP14:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP15:%.*]] = mul nuw nsw i64 [[TMP14]], 6
+; CHECK-SVE-UF4-NEXT:    [[TMP16:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2
+; CHECK-SVE-UF4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF4:       vector.body:
+; CHECK-SVE-UF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY7]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY6]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK10:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP5]]
+; CHECK-SVE-UF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP7]]
+; CHECK-SVE-UF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP9]]
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP19]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP20]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP21]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT:    [[TMP22:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE-UF4-NEXT:    [[TMP23:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD11]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE-UF4-NEXT:    [[TMP24:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD12]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE-UF4-NEXT:    [[TMP25:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD13]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE-UF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP11]]
+; CHECK-SVE-UF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP13]]
+; CHECK-SVE-UF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP15]]
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP22]], ptr [[TMP26]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP23]], ptr [[TMP27]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]])
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP24]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK9]])
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP25]], ptr [[TMP29]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK10]])
+; CHECK-SVE-UF4-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF4-NEXT:    [[INDEX_NEXT16]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF4-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX_NEXT]], [[TMP17]]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP30]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT17:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT18]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT19]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT20]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT17]], i64 2)
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT21]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT17]], i64 0)
+; CHECK-SVE-UF4-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT21]], i64 0
+; CHECK-SVE-UF4-NEXT:    br i1 [[TMP31]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE-UF4:       for.cond.cleanup:
+; CHECK-SVE-UF4-NEXT:    ret void
+;
+; CHECK-SVE2p1-UF0-LABEL: define void @f1(
+; CHECK-SVE2p1-UF0-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE2p1-UF0-NEXT:  entry:
+; CHECK-SVE2p1-UF0-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE2p1-UF0-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE2p1-UF0:       for.body.preheader:
+; CHECK-SVE2p1-UF0-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-SVE2p1-UF0-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF0-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE2p1-UF0:       vector.body:
+; CHECK-SVE2p1-UF0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF0-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF0-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP2]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP3:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF0-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP3]], ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE2p1-UF0-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE2p1-UF0-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF0-NEXT:    [[TMP5:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-SVE2p1-UF0-NEXT:    br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE2p1-UF0:       for.cond.cleanup:
+; CHECK-SVE2p1-UF0-NEXT:    ret void
+;
+; CHECK-SVE2p1-UF2-LABEL: define void @f1(
+; CHECK-SVE2p1-UF2-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE2p1-UF2-NEXT:  entry:
+; CHECK-SVE2p1-UF2-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE2p1-UF2-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE2p1-UF2:       for.body.preheader:
+; CHECK-SVE2p1-UF2-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1
+; CHECK-SVE2p1-UF2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE2p1-UF2:       vector.body:
+; CHECK-SVE2p1-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY2]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK3:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP3]]
+; CHECK-SVE2p1-UF2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP6]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE2p1-UF2-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK3]], <vscale x 2 x double> poison)
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP8:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP9:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD4]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[TMP5]]
+; CHECK-SVE2p1-UF2-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP8]], ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE2p1-UF2-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP9]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK3]])
+; CHECK-SVE2p1-UF2-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE2p1-UF2-NEXT:    [[INDEX_NEXT5]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT6]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-SVE2p1-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT7]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE2p1-UF2-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT7]], i64 0
+; CHECK-SVE2p1-UF2-NEXT:    br i1 [[TMP12]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE2p1-UF2:       for.cond.cleanup:
+; CHECK-SVE2p1-UF2-NEXT:    ret void
+;
+; CHECK-SVE2p1-UF4-LABEL: define void @f1(
+; CHECK-SVE2p1-UF4-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE2p1-UF4-NEXT:  entry:
+; CHECK-SVE2p1-UF4-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE2p1-UF4-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE2p1-UF4:       for.body.preheader:
+; CHECK-SVE2p1-UF4-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 2)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY3]], i64 0)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP9:%.*]] = mul nuw nsw i64 [[TMP8]], 6
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 1
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP12:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP14:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP15:%.*]] = mul nuw nsw i64 [[TMP14]], 6
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP16:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2
+; CHECK-SVE2p1-UF4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE2p1-UF4:       vector.body:
+; CHECK-SVE2p1-UF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY7]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY6]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK10:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP5]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP7]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP9]]
+; CHECK-SVE2p1-UF4-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE2p1-UF4-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP19]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 2 x double> poison)
+; CHECK-SVE2p1-UF4-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP20]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 2 x double> poison)
+; CHECK-SVE2p1-UF4-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP21]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 2 x double> poison)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP22:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP23:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD11]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP24:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD12]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP25:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD13]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP11]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP13]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP15]]
+; CHECK-SVE2p1-UF4-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP22]], ptr [[TMP26]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE2p1-UF4-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP23]], ptr [[TMP27]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]])
+; CHECK-SVE2p1-UF4-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP24]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK9]])
+; CHECK-SVE2p1-UF4-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP25]], ptr [[TMP29]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK10]])
+; CHECK-SVE2p1-UF4-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE2p1-UF4-NEXT:    [[INDEX_NEXT16]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX_NEXT]], [[TMP17]]
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP30]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT17:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT18]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT19]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT20]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT17]], i64 2)
+; CHECK-SVE2p1-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT21]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT17]], i64 0)
+; CHECK-SVE2p1-UF4-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT21]], i64 0
+; CHECK-SVE2p1-UF4-NEXT:    br i1 [[TMP31]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE2p1-UF4:       for.cond.cleanup:
+; CHECK-SVE2p1-UF4-NEXT:    ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %src, i64 %indvars.iv
+  %0 = load double, ptr %arrayidx, align 8
+  %mul = fmul double %0, 3.000000e+00
+  %arrayidx2 = getelementptr inbounds double, ptr %dst, i64 %indvars.iv
+  store double %mul, ptr %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = {nounwind vscale_range(1,16) "target-cpu"="neoverse-v1" }
+;.
+; CHECK-SVE-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-UF0: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
+; CHECK-SVE-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
+; CHECK-SVE-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
+; CHECK-SVE2p1-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE2p1-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE2p1-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE2p1-UF0: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
+; CHECK-SVE2p1-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE2p1-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE2p1-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE2p1-UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
+; CHECK-SVE2p1-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE2p1-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE2p1-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE2p1-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
index 8f3cfc5b20ee9e6..c9645646ad0acb6 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
@@ -56,10 +56,10 @@ define dso_local void @interleave4(ptr noalias nocapture %A, ptr noalias nocaptu
 ; PREDFLAG:  %[[ADD2:.*]] = add i32 %index, 4
 ; PREDFLAG:  %[[ADD3:.*]] = add i32 %index, 8
 ; PREDFLAG:  %[[ADD4:.*]] = add i32 %index, 12
-; PREDFLAG:  %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %N)
-; PREDFLAG:  %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %N)
-; PREDFLAG:  %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %N)
 ; PREDFLAG:  %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %N)
+; PREDFLAG:  %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %N)
+; PREDFLAG:  %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %N)
+; PREDFLAG:  %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %N)
 ;
 ; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
 ; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
diff --git a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll
index 452fa9fa8c8e5f8..9584b3dfb2587ac 100644
--- a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll
@@ -49,12 +49,12 @@ define float @pr70988() {
 ; CHECK-ALM-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-ALM-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX1]], 0
 ; CHECK-ALM-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX1]], 1
-; CHECK-ALM-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i32(i32 [[TMP0]], i32 1021)
-; CHECK-ALM-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i32(i32 [[TMP1]], i32 1021)
-; CHECK-ALM-NEXT:    [[TMP2:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-ALM-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i32(i32 [[TMP1]], i32 1021)
+; CHECK-ALM-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i32(i32 [[TMP0]], i32 1021)
+; CHECK-ALM-NEXT:    [[TMP2:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK2]], i32 0
 ; CHECK-ALM-NEXT:    [[TMP3:%.*]] = select contract i1 [[TMP2]], float 1.000000e+00, float -0.000000e+00
 ; CHECK-ALM-NEXT:    [[TMP4:%.*]] = fadd contract float [[VEC_PHI]], [[TMP3]]
-; CHECK-ALM-NEXT:    [[TMP5:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK2]], i32 0
+; CHECK-ALM-NEXT:    [[TMP5:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK]], i32 0
 ; CHECK-ALM-NEXT:    [[TMP6:%.*]] = select contract i1 [[TMP5]], float 1.000000e+00, float -0.000000e+00
 ; CHECK-ALM-NEXT:    [[TMP7]] = fadd contract float [[TMP4]], [[TMP6]]
 ; CHECK-ALM-NEXT:    [[INDEX_NEXT3]] = add i32 [[INDEX1]], 2
@@ -160,9 +160,9 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) {
 ; CHECK-ALM-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_LOAD_CONTINUE3]] ]
 ; CHECK-ALM-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-ALM-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
-; CHECK-ALM-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i32(i32 [[TMP0]], i32 15)
-; CHECK-ALM-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i32(i32 [[TMP1]], i32 15)
-; CHECK-ALM-NEXT:    [[TMP2:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-ALM-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i32(i32 [[TMP1]], i32 15)
+; CHECK-ALM-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i32(i32 [[TMP0]], i32 15)
+; CHECK-ALM-NEXT:    [[TMP2:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK1]], i32 0
 ; CHECK-ALM-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK-ALM:       pred.load.if:
 ; CHECK-ALM-NEXT:    [[TMP3:%.*]] = getelementptr float, ptr [[SRC]], i32 [[TMP0]]
@@ -170,7 +170,7 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) {
 ; CHECK-ALM-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK-ALM:       pred.load.continue:
 ; CHECK-ALM-NEXT:    [[TMP5:%.*]] = phi float [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ]
-; CHECK-ALM-NEXT:    [[TMP6:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK1]], i32 0
+; CHECK-ALM-NEXT:    [[TMP6:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK]], i32 0
 ; CHECK-ALM-NEXT:    br i1 [[TMP6]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3]]
 ; CHECK-ALM:       pred.load.if2:
 ; CHECK-ALM-NEXT:    [[TMP7:%.*]] = getelementptr float, ptr [[SRC]], i32 [[TMP1]]
@@ -178,10 +178,10 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) {
 ; CHECK-ALM-NEXT:    br label [[PRED_LOAD_CONTINUE3]]
 ; CHECK-ALM:       pred.load.continue3:
 ; CHECK-ALM-NEXT:    [[TMP9:%.*]] = phi float [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP8]], [[PRED_LOAD_IF2]] ]
-; CHECK-ALM-NEXT:    [[TMP10:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-ALM-NEXT:    [[TMP10:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK1]], i32 0
 ; CHECK-ALM-NEXT:    [[TMP11:%.*]] = select contract i1 [[TMP10]], float [[TMP5]], float -0.000000e+00
 ; CHECK-ALM-NEXT:    [[TMP12:%.*]] = fadd contract float [[VEC_PHI]], [[TMP11]]
-; CHECK-ALM-NEXT:    [[TMP13:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK1]], i32 0
+; CHECK-ALM-NEXT:    [[TMP13:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK]], i32 0
 ; CHECK-ALM-NEXT:    [[TMP14:%.*]] = select contract i1 [[TMP13]], float [[TMP9]], float -0.000000e+00
 ; CHECK-ALM-NEXT:    [[TMP15]] = fadd contract float [[TMP12]], [[TMP14]]
 ; CHECK-ALM-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2

>From ce2b80f3bf055a3b1de630657e89f173f5e4c12b Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 28 Dec 2023 16:15:11 +0000
Subject: [PATCH 3/3] [AArch64] Optimise test of the LSB of a paired whileCC
 insntruction

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 75 ++++++++++++-------
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 15 +++-
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 12 +--
 .../CodeGen/AArch64/sve-wide-lane-mask.ll     | 42 ++++-------
 4 files changed, 84 insertions(+), 60 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 64a6aa681ade9f6..5eef6e618f120a6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17800,22 +17800,49 @@ static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
                         AArch64CC::CondCode Cond);
 
-static bool isPredicateCCSettingOp(SDValue N) {
-  if ((N.getOpcode() == ISD::SETCC) ||
-      (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
-       (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
-        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
-        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
-        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
-        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
-        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
-        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
-        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
-        // get_active_lane_mask is lowered to a whilelo instruction.
-        N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
-    return true;
+static SDValue getPredicateCCSettingOp(SDValue N) {
+  if (N.getOpcode() == ISD::SETCC) {
+    EVT VT = N.getValueType();
+    return VT.isScalableVector() && VT.getVectorElementType() == MVT::i1
+               ? N
+               : SDValue();
+  }
 
-  return false;
+  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      isNullConstant(N.getOperand(1)))
+    N = N.getOperand(0);
+
+  if (N.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+    return SDValue();
+
+  switch (N.getConstantOperandVal(0)) {
+  default:
+    return SDValue();
+  case Intrinsic::aarch64_sve_whilege_x2:
+  case Intrinsic::aarch64_sve_whilegt_x2:
+  case Intrinsic::aarch64_sve_whilehi_x2:
+  case Intrinsic::aarch64_sve_whilehs_x2:
+  case Intrinsic::aarch64_sve_whilele_x2:
+  case Intrinsic::aarch64_sve_whilelo_x2:
+  case Intrinsic::aarch64_sve_whilels_x2:
+  case Intrinsic::aarch64_sve_whilelt_x2:
+    if (N.getResNo() != 0)
+      return SDValue();
+    [[fallthrough]];
+  case Intrinsic::aarch64_sve_whilege:
+  case Intrinsic::aarch64_sve_whilegt:
+  case Intrinsic::aarch64_sve_whilehi:
+  case Intrinsic::aarch64_sve_whilehs:
+  case Intrinsic::aarch64_sve_whilele:
+  case Intrinsic::aarch64_sve_whilelo:
+  case Intrinsic::aarch64_sve_whilels:
+  case Intrinsic::aarch64_sve_whilelt:
+  case Intrinsic::get_active_lane_mask:
+    assert(N.getValueType().isScalableVector() &&
+           N.getValueType().getVectorElementType() == MVT::i1 &&
+           "Intrinsic expected to yield scalable i1 vector");
+    return N;
+  }
 }
 
 // Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
@@ -17829,21 +17856,17 @@ performFirstTrueTestVectorCombine(SDNode *N,
   if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
     return SDValue();
 
-  SDValue N0 = N->getOperand(0);
-  EVT VT = N0.getValueType();
-
-  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
-      !isNullConstant(N->getOperand(1)))
-    return SDValue();
-
-  // Restricted the DAG combine to only cases where we're extracting from a
-  // flag-setting operation.
-  if (!isPredicateCCSettingOp(N0))
+  // Restrict the DAG combine to only cases where we're extracting the zero-th
+  // element from the result of a flag-setting operation.
+  SDValue N0;
+  if (!isNullConstant(N->getOperand(1)) ||
+      !(N0 = getPredicateCCSettingOp(N->getOperand(0))))
     return SDValue();
 
   // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
   SelectionDAG &DAG = DCI.DAG;
-  SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
+  SDValue Pg =
+      getPTrue(DAG, SDLoc(N), N0.getValueType(), AArch64SVEPredPattern::all);
   return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 42b7a6418032adc..175ae0413875543 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1356,11 +1356,22 @@ bool AArch64InstrInfo::optimizePTestInstr(
     const MachineRegisterInfo *MRI) const {
   auto *Mask = MRI->getUniqueVRegDef(MaskReg);
   auto *Pred = MRI->getUniqueVRegDef(PredReg);
-  auto NewOp = Pred->getOpcode();
+  unsigned NewOp;
   bool OpChanged = false;
 
   unsigned MaskOpcode = Mask->getOpcode();
   unsigned PredOpcode = Pred->getOpcode();
+
+  // Handle a COPY from the LSB of a paired WHILEcc instruction.
+  if ((PredOpcode == TargetOpcode::COPY &&
+       Pred->getOperand(1).getSubReg() == AArch64::psub0)) {
+    MachineInstr *MI = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
+    if (MI && isWhileOpcode(MI->getOpcode())) {
+      Pred = MI;
+      PredOpcode = MI->getOpcode();
+    }
+  }
+
   bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
   bool PredIsWhileLike = isWhileOpcode(PredOpcode);
 
@@ -1476,9 +1487,9 @@ bool AArch64InstrInfo::optimizePTestInstr(
   // as they are prior to PTEST. Sometimes this requires the tested PTEST
   // operand to be replaced with an equivalent instruction that also sets the
   // flags.
-  Pred->setDesc(get(NewOp));
   PTest->eraseFromParent();
   if (OpChanged) {
+    Pred->setDesc(get(NewOp));
     bool succeeded = UpdateOperandRegClass(*Pred);
     (void)succeeded;
     assert(succeeded && "Operands have incompatible register classes!");
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 789ec817d3d8b89..718b245c6d8290a 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -9754,7 +9754,7 @@ multiclass sve2p1_int_while_rr_pn<string mnemonic, bits<3> opc> {
 
 // SVE integer compare scalar count and limit (predicate pair)
 class sve2p1_int_while_rr_pair<string mnemonic, bits<2> sz, bits<3> opc,
-                             RegisterOperand ppr_ty>
+                             RegisterOperand ppr_ty, ElementSizeEnum EltSz>
     : I<(outs ppr_ty:$Pd), (ins GPR64:$Rn, GPR64:$Rm),
         mnemonic, "\t$Pd, $Rn, $Rm",
         "", []>, Sched<[]> {
@@ -9772,16 +9772,18 @@ class sve2p1_int_while_rr_pair<string mnemonic, bits<2> sz, bits<3> opc,
   let Inst{3-1}   = Pd;
   let Inst{0}     = opc{0};
 
+  let ElementSize = EltSz;
   let Defs = [NZCV];
   let hasSideEffects = 0;
+  let isWhile = 1;
 }
 
 
 multiclass sve2p1_int_while_rr_pair<string mnemonic, bits<3> opc> {
- def _B : sve2p1_int_while_rr_pair<mnemonic, 0b00, opc, PP_b_mul_r>;
- def _H : sve2p1_int_while_rr_pair<mnemonic, 0b01, opc, PP_h_mul_r>;
- def _S : sve2p1_int_while_rr_pair<mnemonic, 0b10, opc, PP_s_mul_r>;
- def _D : sve2p1_int_while_rr_pair<mnemonic, 0b11, opc, PP_d_mul_r>;
+ def _B : sve2p1_int_while_rr_pair<mnemonic, 0b00, opc, PP_b_mul_r, ElementSizeB>;
+ def _H : sve2p1_int_while_rr_pair<mnemonic, 0b01, opc, PP_h_mul_r, ElementSizeH>;
+ def _S : sve2p1_int_while_rr_pair<mnemonic, 0b10, opc, PP_s_mul_r, ElementSizeS>;
+ def _D : sve2p1_int_while_rr_pair<mnemonic, 0b11, opc, PP_d_mul_r, ElementSizeD>;
 }
 
 
diff --git a/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll
index 430b349291d1a31..4269e1c559e4044 100644
--- a/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll
+++ b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll
@@ -212,9 +212,7 @@ define void @f3(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
 ; CHECK-NEXT:    whilelo p1.s, x8, x9
 ; CHECK-NEXT:    punpkhi p0.h, p1.b
 ; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    mov z1.d, p1/z, #1 // =0x1
-; CHECK-NEXT:    fmov x13, d1
-; CHECK-NEXT:    tbnz w13, #0, .LBB3_2
+; CHECK-NEXT:    b.mi .LBB3_2
 ; CHECK-NEXT:  .LBB3_3: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:
@@ -431,21 +429,19 @@ define void @f5(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
 ; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x13, x9, lsl #3]
 ; CHECK-NEXT:    fmul z2.d, z2.d, z0.d
 ; CHECK-NEXT:    fmul z3.d, z3.d, z0.d
-; CHECK-NEXT:    fmul z4.d, z4.d, z0.d
 ; CHECK-NEXT:    st1d { z1.d }, p3, [x0, x9, lsl #3]
+; CHECK-NEXT:    fmul z1.d, z4.d, z0.d
 ; CHECK-NEXT:    st1d { z2.d }, p2, [x12, x9, lsl #3]
 ; CHECK-NEXT:    st1d { z3.d }, p1, [x11, x9, lsl #3]
-; CHECK-NEXT:    st1d { z4.d }, p0, [x10, x9, lsl #3]
+; CHECK-NEXT:    whilelo p3.s, x18, x8
+; CHECK-NEXT:    st1d { z1.d }, p0, [x10, x9, lsl #3]
 ; CHECK-NEXT:    add x9, x16, x9
-; CHECK-NEXT:    whilelo p1.s, x18, x8
-; CHECK-NEXT:    whilelo p3.s, x9, x8
-; CHECK-NEXT:    punpkhi p0.h, p1.b
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    punpkhi p2.h, p3.b
-; CHECK-NEXT:    punpklo p3.h, p3.b
-; CHECK-NEXT:    mov z1.d, p3/z, #1 // =0x1
-; CHECK-NEXT:    fmov x18, d1
-; CHECK-NEXT:    tbnz w18, #0, .LBB5_2
+; CHECK-NEXT:    punpkhi p0.h, p3.b
+; CHECK-NEXT:    punpklo p1.h, p3.b
+; CHECK-NEXT:    whilelo p4.s, x9, x8
+; CHECK-NEXT:    punpkhi p2.h, p4.b
+; CHECK-NEXT:    punpklo p3.h, p4.b
+; CHECK-NEXT:    b.mi .LBB5_2
 ; CHECK-NEXT:  .LBB5_3: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:
@@ -644,9 +640,7 @@ define void @f8(ptr noalias %dst, ptr readonly %src, i32 %n) #1 {
 ; CHECK-NEXT:    st1b { z1.b }, p1, [x10, x8]
 ; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    whilelo { p0.b, p1.b }, x8, x9
-; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    fmov w12, s0
-; CHECK-NEXT:    tbnz w12, #0, .LBB8_2
+; CHECK-NEXT:    b.mi .LBB8_2
 ; CHECK-NEXT:  .LBB8_3: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:
@@ -716,9 +710,7 @@ define void @f9(ptr noalias %dst, ptr readonly %src, i32 %n) #1 {
 ; CHECK-NEXT:    st1d { z2.d }, p1, [x10, x8, lsl #3]
 ; CHECK-NEXT:    incw x8
 ; CHECK-NEXT:    whilelo { p0.d, p1.d }, x8, x9
-; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    fmov x12, d1
-; CHECK-NEXT:    tbnz w12, #0, .LBB9_2
+; CHECK-NEXT:    b.mi .LBB9_2
 ; CHECK-NEXT:  .LBB9_3: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:
@@ -801,9 +793,7 @@ define void @f10(ptr noalias %dst, ptr readonly %src, i32 %n) #1 {
 ; CHECK-NEXT:    addvl x9, x9, #4
 ; CHECK-NEXT:    whilelo { p0.b, p1.b }, x16, x8
 ; CHECK-NEXT:    whilelo { p2.b, p3.b }, x9, x8
-; CHECK-NEXT:    mov z0.b, p2/z, #1 // =0x1
-; CHECK-NEXT:    fmov w16, s0
-; CHECK-NEXT:    tbnz w16, #0, .LBB10_2
+; CHECK-NEXT:    b.mi .LBB10_2
 ; CHECK-NEXT:  .LBB10_3: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:
@@ -914,15 +904,13 @@ define void @f11(ptr noalias %dst, ptr readonly %src, i32 %n) #1 {
 ; CHECK-NEXT:    fmul z4.d, z4.d, z0.d
 ; CHECK-NEXT:    st1d { z1.d }, p2, [x0, x9, lsl #3]
 ; CHECK-NEXT:    st1d { z2.d }, p3, [x12, x9, lsl #3]
-; CHECK-NEXT:    whilelo { p2.d, p3.d }, x15, x8
-; CHECK-NEXT:    mov z1.d, p2/z, #1 // =0x1
 ; CHECK-NEXT:    st1d { z3.d }, p0, [x11, x9, lsl #3]
 ; CHECK-NEXT:    st1d { z4.d }, p1, [x10, x9, lsl #3]
 ; CHECK-NEXT:    incw x9, all, mul #3
 ; CHECK-NEXT:    whilelo { p0.d, p1.d }, x9, x8
+; CHECK-NEXT:    whilelo { p2.d, p3.d }, x15, x8
 ; CHECK-NEXT:    mov x9, x15
-; CHECK-NEXT:    fmov x17, d1
-; CHECK-NEXT:    tbnz w17, #0, .LBB11_2
+; CHECK-NEXT:    b.mi .LBB11_2
 ; CHECK-NEXT:  .LBB11_3: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:



More information about the llvm-commits mailing list