[llvm] [LV] Mask off possibly aliasing vector lanes (PR #100579)

Sam Tebbs via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 25 07:59:43 PDT 2024


https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/100579

>From 96b8b65968c9f8b6665b001b7ee92da6164e8bf6 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 18 Oct 2024 15:49:22 +0100
Subject: [PATCH 1/5] [LV] Mask off possibly aliasing vector lanes

    When vectorising a loop that uses loads and stores, those pointers could
    overlap if their difference is less than the vector factor. For example,
    if address 20 is being stored to and address 23 is being loaded from, they
    overlap when the vector factor is 4 or higher. Currently LoopVectorize
    branches to a scalar loop in these cases, guarded by a runtime check.
    However, if we construct a mask that disables the overlapping (aliasing)
    lanes, then the vectorised loop can safely be entered, as long as the
    loads and stores are masked off.
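
    As a rough illustrative sketch (not taken from the patch itself), the kind
    of loop this targets looks like the following, where b and c may alias but
    a is known not to:

        // Hypothetical example loop: b and c may point into the same buffer.
        void add(char *restrict a, char *b, char *c, int n) {
          for (int i = 0; i < n; i++)
            c[i] = a[i] + b[i];
        }

    If, say, c is stored to at address 20 and b is loaded from at address 23,
    their difference is 3, so a mask along the lines of
    llvm.get.active.lane.mask(0, b - c) keeps only the first three lanes
    active in each vector iteration, and the canonical induction variable is
    then advanced by the popcount of that mask rather than by VF * UF.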
---
 .../llvm/Analysis/TargetTransformInfo.h       |   7 ++
 .../Vectorize/LoopVectorizationPlanner.h      |  22 +++-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  89 +++++++++----
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   1 -
 llvm/lib/Transforms/Vectorize/VPlan.h         |  48 +++++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  64 +++++++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  68 ++++++++--
 .../Transforms/Vectorize/VPlanTransforms.h    |   6 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   1 +
 .../LoopVectorize/AArch64/alias_mask.ll       | 117 ++++++++++++++++++
 .../AArch64/induction-costs-sve.ll            |  25 +++-
 .../runtime-check-small-clamped-bounds.ll     |  22 ++--
 .../runtime-checks-difference.ll              |  62 +++++-----
 13 files changed, 443 insertions(+), 89 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0459941fe05cdc..860db629d2358a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -194,6 +194,13 @@ enum class TailFoldingStyle {
   DataWithEVL,
 };
 
+enum class RTCheckStyle {
+  /// Branch to the scalar loop if the checks fail at runtime.
+  ScalarFallback,
+  /// Form a mask based on the elements that won't cause a WAR or RAW hazard.
+  UseSafeEltsMask,
+};
+
 struct TailFoldingInfo {
   TargetLibraryInfo *TLI;
   LoopVectorizationLegality *LVL;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 1c8d541ef2c51f..4d3a54526d7d56 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -418,7 +418,13 @@ class LoopVectorizationPlanner {
   /// Build VPlans for the specified \p UserVF and \p UserIC if they are
   /// non-zero or all applicable candidate VFs otherwise. If vectorization and
   /// interleaving should be avoided up-front, no plans are generated.
-  void plan(ElementCount UserVF, unsigned UserIC);
+  /// \p DiffChecks is a list of pointer pairs that should be checked for
+  /// aliasing. \p HasAliasMask is set to true if an alias mask is generated
+  /// and the vector loop should be entered even if the pointers alias across
+  /// a loop iteration.
+  void plan(ElementCount UserVF, unsigned UserIC,
+            std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
+            bool &HasAliasMask);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
@@ -495,12 +501,22 @@ class LoopVectorizationPlanner {
   /// returned VPlan is valid for. If no VPlan can be built for the input range,
   /// set the largest included VF to the maximum VF for which no plan could be
   /// built.
-  VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range);
+  /// \p RTChecks is a list of pointer pairs that should be checked for
+  /// aliasing. \p HasAliasMask is set to true if an alias mask is generated
+  /// and the vector loop should be entered even if the pointers alias across
+  /// a loop iteration.
+  VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range,
+                                        ArrayRef<PointerDiffInfo> RTChecks,
+                                        bool &HasAliasMask);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
-  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
+  /// \p RTChecks contains the list of pointer pairs for which an alias mask
+  /// should be generated.
+  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
+                                ArrayRef<PointerDiffInfo> RTChecks,
+                                bool &HasAliasMask);
 
   // Adjust the recipes for reductions. For in-loop reductions the chain of
   // instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8653498d32a12..6ca5581683e824 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -173,6 +173,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");
 
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -1806,6 +1807,10 @@ class GeneratedRTChecks {
   PredicatedScalarEvolution &PSE;
 
 public:
+  /// Set by VPlan when the vector loop should be entered even when runtime
+  /// checks determine that pointers alias within an iteration.
+  bool HasAliasMask = false;
+
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
                     const DataLayout &DL, bool AddBranchWeights)
@@ -1847,9 +1852,11 @@ class GeneratedRTChecks {
 
     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
     if (RtPtrChecking.Need) {
-      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
-      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
-                                 "vector.memcheck");
+      if (!MemCheckBlock) {
+        auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
+        MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
+                                   "vector.memcheck");
+      }
 
       auto DiffChecks = RtPtrChecking.getDiffChecks();
       if (DiffChecks) {
@@ -2077,11 +2084,18 @@ class GeneratedRTChecks {
     if (OuterLoop)
       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
 
-    BranchInst &BI =
-        *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
-    if (AddBranchWeights) {
+    // TODO: Branch to the vector preheader conditionally based on the number
+    // of non-aliasing elements. The scalar loop will likely be better if only
+    // one or two elements will be processed per vectorised loop iteration.
+
+    // Jump to the vector preheader unconditionally if it's safe to do so
+    // because an alias mask has been set up.
+    BranchInst &BI = HasAliasMask
+                         ? *BranchInst::Create(LoopVectorPreHeader)
+                         : *BranchInst::Create(Bypass, LoopVectorPreHeader,
+                                               MemRuntimeCheckCond);
+    if (!HasAliasMask && AddBranchWeights)
       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
-    }
     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
     MemCheckBlock->getTerminator()->setDebugLoc(
         Pred->getTerminator()->getDebugLoc());
@@ -2564,7 +2578,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
     });
   }
 
-  LoopBypassBlocks.push_back(MemCheckBlock);
+  // If an alias mask has been set up then we don't need the bypass, as the
+  // vector preheader will be branched to unconditionally.
+  if (!RTChecks.HasAliasMask)
+    LoopBypassBlocks.push_back(MemCheckBlock);
 
   AddedSafetyChecks = true;
 
@@ -7097,7 +7114,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   return VectorizationFactor::Disabled();
 }
 
-void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+void LoopVectorizationPlanner::plan(
+    ElementCount UserVF, unsigned UserIC,
+    std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
@@ -7106,6 +7125,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
     return;
 
+  ArrayRef<PointerDiffInfo> DiffChecks;
+  if (RTChecks.has_value() && useActiveLaneMask(CM.getTailFoldingStyle(true)))
+    DiffChecks = *RTChecks;
+
   // Invalidate interleave groups if all blocks of loop will be predicated.
   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
       !useMaskedInterleavedAccesses(TTI)) {
@@ -7138,7 +7161,7 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.collectInLoopReductions();
       if (CM.selectUserVectorizationFactor(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-        buildVPlansWithVPRecipes(UserVF, UserVF);
+        buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks, HasAliasMask);
         LLVM_DEBUG(printPlans(dbgs()));
         return;
       }
@@ -7167,8 +7190,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.collectInstsToScalarize(VF);
   }
 
-  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
-  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
+  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
+                           DiffChecks, HasAliasMask);
+  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
+                           DiffChecks, HasAliasMask);
 
   LLVM_DEBUG(printPlans(dbgs()));
 }
@@ -7690,7 +7715,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
                              CanonicalIVStartValue, State);
 
   BestVPlan.execute(&State);
-
   // 2.5 Collect reduction resume values.
   auto *ExitVPBB =
       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
@@ -7923,7 +7947,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
   // reduction phis in the scalar loop preheader.
   if (EPI.SCEVSafetyCheck)
     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
-  if (EPI.MemSafetyCheck)
+  if (EPI.MemSafetyCheck && !RTChecks.HasAliasMask)
     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
 
@@ -8179,9 +8203,8 @@ void VPRecipeBuilder::createHeaderMask() {
 
   VPBuilder::InsertPointGuard Guard(Builder);
   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
-  VPValue *BlockMask = nullptr;
   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-  BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+  VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
   BlockMaskCache[Header] = BlockMask;
 }
 
@@ -8720,14 +8743,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   return tryToWiden(Instr, Operands, VPBB);
 }
 
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
-                                                        ElementCount MaxVF) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
+    ElementCount MinVF, ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
+    bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
 
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
-    if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
+    if (auto Plan =
+            tryToBuildVPlanWithVPRecipes(SubRange, RTChecks, HasAliasMask)) {
       // Now optimize the initial VPlan.
       if (!Plan->hasVF(ElementCount::getFixed(1)))
         VPlanTransforms::truncateToMinimalBitwidths(*Plan,
@@ -8760,6 +8785,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 
   VPBuilder Builder(TopRegion->getExitingBasicBlock());
   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
+  // If an alias mask is present, this will be replaced by an increment of the
+  // mask's popcount.
   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
       "index.next");
@@ -8978,8 +9005,8 @@ static void addLiveOutsForFirstOrderRecurrences(
   }
 }
 
-VPlanPtr
-LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
+VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
+    VFRange &Range, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask) {
 
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
 
@@ -9215,7 +9242,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     bool WithoutRuntimeCheck =
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
-                                       WithoutRuntimeCheck);
+                                       WithoutRuntimeCheck, PSE, RTChecks);
+    if (ForControlFlow && !RTChecks.empty())
+      HasAliasMask = true;
   }
   return Plan;
 }
@@ -9699,6 +9728,7 @@ static bool processLoopInVPlanNativePath(
   // Mark the loop as already vectorized to avoid vectorizing again.
   Hints.setAlreadyVectorized();
   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+
   return true;
 }
 
@@ -10030,18 +10060,23 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
 
+  bool AddBranchWeights =
+      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
+  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
+                           AddBranchWeights);
+
   // Plan how to best vectorize.
-  LVP.plan(UserVF, UserIC);
+  LVP.plan(UserVF, UserIC,
+           LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(),
+           Checks.HasAliasMask);
   VectorizationFactor VF = LVP.computeBestVF();
+  if (Checks.HasAliasMask)
+    LoopsAliasMasked++;
   unsigned IC = 1;
 
   if (ORE->allowExtraAnalysis(LV_NAME))
     LVP.emitInvalidCostRemarks(ORE);
 
-  bool AddBranchWeights =
-      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
-  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
-                           AddBranchWeights);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c1b97791331bcf..161b2de2beb7ae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -962,7 +962,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
-  assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 59a084401cc9bf..4fb47714e0d19c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -895,6 +895,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     switch (R->getVPDefID()) {
     case VPRecipeBase::VPDerivedIVSC:
     case VPRecipeBase::VPEVLBasedIVPHISC:
+    case VPRecipeBase::VPAliasLaneMaskSC:
     case VPRecipeBase::VPExpandSCEVSC:
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionEVLSC:
@@ -1270,6 +1271,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // last. The second operand must be a positive constant and <= VF.
     ExtractFromEnd,
     LogicalAnd, // Non-poison propagating logical And.
+    PopCount,
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
@@ -2993,6 +2995,52 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   }
 };
 
+/// Given a pointer A that is being stored to, and a pointer B that is being
+/// read from, both with unknown lengths, create a mask that disables the
+/// elements which could overlap across a loop iteration. For example, if A is
+/// X and B is X + 2 with a VF of 4, only the final two elements of the loaded
+/// vector can be stored since they don't overlap with the stored vector:
+///   %b.vec = load %b ; = [s, t, u, v]
+///   [...]
+///   store %a, %b.vec ; only u and v can be stored as their addresses don't
+///                    ; overlap with %a + (VF - 1)
+class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
+
+public:
+  VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink, unsigned ElementSize)
+      : VPSingleDefRecipe(VPDef::VPAliasLaneMaskSC, {Src, Sink}),
+        ElementSize(ElementSize) {}
+
+  ~VPAliasLaneMaskRecipe() override = default;
+
+  VPAliasLaneMaskRecipe *clone() override {
+    return new VPAliasLaneMaskRecipe(getSourceValue(), getSinkValue(),
+                                     ElementSize);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPAliasLaneMaskSC);
+
+  void execute(VPTransformState &State) override;
+
+  /// Get the VPValue* for the pointer being read from.
+  VPValue *getSourceValue() const { return getOperand(0); }
+
+  /// Get the size of the element(s) accessed by the pointers.
+  unsigned getAccessedElementSize() const { return ElementSize; }
+
+  /// Get the VPValue* for the pointer being stored to.
+  VPValue *getSinkValue() const { return getOperand(1); }
+
+private:
+  unsigned ElementSize;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// Recipe to expand a SCEV expression.
 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
   const SCEV *Expr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 41f13cc2d9a978..188dc56388dd9c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -399,6 +399,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::PopCount:
     return true;
   default:
     return false;
@@ -465,6 +466,29 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
   }
+  // Count the number of bits set in each lane and reduce to a scalar.
+  case VPInstruction::PopCount: {
+    Value *Op = State.get(getOperand(0));
+    auto *VT = Op->getType();
+    Value *Cnt = Op;
+
+    // i1 vectors can just use the add reduction. Bigger elements need a ctpop
+    // first.
+    if (VT->getScalarSizeInBits() > 1)
+      Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {VT}, {Cnt});
+
+    auto *VecVT = cast<VectorType>(VT);
+    // Extend to an i8 since i1 is too narrow for the add reduction.
+    if (VecVT->getElementType()->getScalarSizeInBits() < 8) {
+      Cnt = Builder.CreateCast(
+          Instruction::ZExt, Cnt,
+          VectorType::get(Builder.getInt8Ty(), VecVT->getElementCount()));
+    }
+
+    Cnt = Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Cnt);
+    Cnt = Builder.CreateCast(Instruction::ZExt, Cnt, Builder.getInt64Ty());
+    return Cnt;
+  }
   case VPInstruction::FirstOrderRecurrenceSplice: {
     // Generate code to combine the previous and current values in vector v3.
     //
@@ -682,7 +706,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
-         getOpcode() == VPInstruction::ComputeReductionResult;
+         getOpcode() == VPInstruction::ComputeReductionResult ||
+         getOpcode() == PopCount;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -810,6 +835,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ResumePhi:
     O << "resume-phi";
     break;
+  case VPInstruction::PopCount:
+    O << "popcount";
+    break;
   case VPInstruction::ExplicitVectorLength:
     O << "EXPLICIT-VECTOR-LENGTH";
     break;
@@ -3135,6 +3163,40 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
+  IRBuilderBase Builder = State.Builder;
+  Value *SinkValue = State.get(getSinkValue(), true);
+  Value *SourceValue = State.get(getSourceValue(), true);
+
+  Value *Diff = Builder.CreateSub(SourceValue, SinkValue, "sub.diff");
+  auto *Type = Diff->getType();
+  Value *MemEltSize = ConstantInt::get(Type, ElementSize);
+  Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
+  // If the difference is negative then some elements may alias.
+  Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLE, DiffDiv,
+                                  ConstantInt::get(Type, 0), "neg.compare");
+  // Splat the compare result, then OR it with a lane mask.
+  Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
+  Value *DiffMask = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask,
+      {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
+      {ConstantInt::get(Type, 0), DiffDiv}, nullptr, "ptr.diff.lane.mask");
+  Value *Or = Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat);
+  State.set(this, Or, /*IsScalar=*/false);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  getVPSingleValue()->printAsOperand(O, SlotTracker);
+  O << " = alias lane mask ";
+  getSourceValue()->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getSinkValue()->printAsOperand(O, SlotTracker);
+}
+#endif
+
 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
   assert(!State.Lane && "cannot be used in per-lane");
   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d50f3c0c3f3e04..f1b01e32c62470 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1201,8 +1201,9 @@ void VPlanTransforms::optimize(VPlan &Plan) {
 //   %Negated = Not %ALM
 //   branch-on-cond %Negated
 //
-static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
-    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+static VPValue *addVPLaneMaskPhiAndUpdateExitBranch(
+    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck,
+    PredicatedScalarEvolution &PSE, ArrayRef<PointerDiffInfo> RTChecks) {
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
@@ -1212,14 +1213,38 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
   // TODO: Check if dropping the flags is needed if
   // !DataAndControlFlowWithoutRuntimeCheck.
+  VPValue *IncVal = CanonicalIVIncrement->getOperand(1);
+  assert(IncVal != CanonicalIVPHI && "Unexpected operand order");
+
   CanonicalIVIncrement->dropPoisonGeneratingFlags();
   DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+
   // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
   // we have to take unrolling into account. Each part needs to start at
   //   Part * VF
   auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
   VPBuilder Builder(VecPreheader);
 
+  // Create an alias mask for each possibly-aliasing pointer pair. If there
+  // are multiple, they are combined with ANDs.
+  VPValue *AliasMask = nullptr;
+
+  for (auto C : RTChecks) {
+    // FIXME: How to pass this info back?
+    //    HasAliasMask = true;
+    VPValue *Sink =
+        vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SinkStart, *PSE.getSE());
+    VPValue *Src =
+        vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SrcStart, *PSE.getSE());
+    VPAliasLaneMaskRecipe *M =
+        new VPAliasLaneMaskRecipe(Src, Sink, C.AccessSize);
+    VecPreheader->appendRecipe(M);
+    if (AliasMask)
+      AliasMask = Builder.createAnd(AliasMask, M);
+    else
+      AliasMask = M;
+  }
+
   // Create the ActiveLaneMask instruction using the correct start values.
   VPValue *TC = Plan.getTripCount();
 
@@ -1243,14 +1268,37 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
+  VPValue *Mask =
       Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                            DL, "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
-  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(Mask, DebugLoc());
   LaneMaskPhi->insertAfter(CanonicalIVPHI);
+  VPValue *LaneMask = LaneMaskPhi;
+  if (AliasMask) {
+    // Increment phi by correct amount.
+    Builder.setInsertPoint(CanonicalIVIncrement);
+
+    VPValue *IncrementBy = Builder.createNaryOp(VPInstruction::PopCount,
+                                                {AliasMask}, DL, "popcount");
+    Type *IVType = CanonicalIVPHI->getScalarType();
+
+    if (IVType->getScalarSizeInBits() < 64) {
+      auto *Cast =
+          new VPScalarCastRecipe(Instruction::Trunc, IncrementBy, IVType);
+      Cast->insertAfter(IncrementBy->getDefiningRecipe());
+      IncrementBy = Cast;
+    }
+    CanonicalIVIncrement->setOperand(1, IncrementBy);
+
+    // AND in the alias mask so the iteration only processes non-aliasing lanes.
+    Builder.setInsertPoint(CanonicalIVPHI->getParent(),
+                           CanonicalIVPHI->getParent()->getFirstNonPhi());
+    LaneMask = Builder.createNaryOp(Instruction::BinaryOps::And,
+                                    {LaneMaskPhi, AliasMask}, DL);
+  }
 
   // Create the active lane mask for the next iteration of the loop before the
   // original terminator.
@@ -1269,7 +1317,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   auto *NotMask = Builder.createNot(ALM, DL);
   Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
   OriginalTerminator->eraseFromParent();
-  return LaneMaskPhi;
+  return LaneMask;
 }
 
 /// Collect all VPValues representing a header mask through the (ICMP_ULE,
@@ -1319,7 +1367,9 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
 
 void VPlanTransforms::addActiveLaneMask(
     VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
-    bool DataAndControlFlowWithoutRuntimeCheck) {
+    bool DataAndControlFlowWithoutRuntimeCheck, PredicatedScalarEvolution &PSE,
+    ArrayRef<PointerDiffInfo> RTChecks) {
+
   assert((!DataAndControlFlowWithoutRuntimeCheck ||
           UseActiveLaneMaskForControlFlow) &&
          "DataAndControlFlowWithoutRuntimeCheck implies "
@@ -1328,14 +1378,14 @@ void VPlanTransforms::addActiveLaneMask(
   auto *FoundWidenCanonicalIVUser =
       find_if(Plan.getCanonicalIV()->users(),
               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
-  assert(FoundWidenCanonicalIVUser &&
+  assert(FoundWidenCanonicalIVUser && *FoundWidenCanonicalIVUser &&
          "Must have widened canonical IV when tail folding!");
   auto *WideCanonicalIV =
       cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
-  VPSingleDefRecipe *LaneMask;
+  VPValue *LaneMask;
   if (UseActiveLaneMaskForControlFlow) {
     LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
-        Plan, DataAndControlFlowWithoutRuntimeCheck);
+        Plan, DataAndControlFlowWithoutRuntimeCheck, PSE, RTChecks);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
     LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 60a44bfb0dca6b..1eb975c73e495e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -77,9 +77,13 @@ struct VPlanTransforms {
   /// creation) and instead it is handled using active-lane-mask. \p
   /// DataAndControlFlowWithoutRuntimeCheck implies \p
   /// UseActiveLaneMaskForControlFlow.
+  /// \p RTChecks is the list of pointer pairs whose aliasing lanes need to be
+  /// masked off in each loop iteration.
   static void addActiveLaneMask(VPlan &Plan,
                                 bool UseActiveLaneMaskForControlFlow,
-                                bool DataAndControlFlowWithoutRuntimeCheck);
+                                bool DataAndControlFlowWithoutRuntimeCheck,
+                                PredicatedScalarEvolution &PSE,
+                                ArrayRef<PointerDiffInfo> RTChecks);
 
   /// Insert truncates and extends for any truncated recipe. Redundant casts
   /// will be folded later.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index f2978b0a758b6a..e6265f96f4ada6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -336,6 +336,7 @@ class VPDef {
   using VPRecipeTy = enum {
     VPBranchOnMaskSC,
     VPDerivedIVSC,
+    VPAliasLaneMaskSC,
     VPExpandSCEVSC,
     VPIRInstructionSC,
     VPInstructionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
new file mode 100644
index 00000000000000..c4aafa97a334ca
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+
+define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: define dso_local void @alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B4:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[C1]], [[B2]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP5]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B4]], [[C3]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp sle i64 [[DIFF]], 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT:    [[TMP8:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i64 [[WIDE_TRIP_COUNT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP8]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <vscale x 16 x i1> [[TMP8]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP23]])
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i8 [[TMP24]] to i64
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP25]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
+; CHECK-NEXT:    [[TMP26:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i32 0
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP29]], [[TMP28]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %add = add i8 %1, %0
+  %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+  store i8 %add, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 553989544c7787..fd993a1990224c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -139,14 +139,16 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:  entry:
 ; PRED-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; PRED-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT:    [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
 ; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; PRED:       vector.memcheck:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
-; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; PRED-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED-NEXT:    br label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
@@ -156,6 +158,13 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; PRED-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[SRC2]], [[DST1]]
+; PRED-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; PRED-NEXT:    [[NEG_COMPARE:%.*]] = icmp sle i64 [[DIFF]], 0
+; PRED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; PRED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -170,9 +179,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[TMP30:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
 ; PRED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
 ; PRED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
-; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP30]], <vscale x 8 x i8> poison)
 ; PRED-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
 ; PRED-NEXT:    [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
 ; PRED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
@@ -181,8 +191,11 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
 ; PRED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[TMP30]])
+; PRED-NEXT:    [[TMP31:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
+; PRED-NEXT:    [[TMP32:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP31]])
+; PRED-NEXT:    [[TMP33:%.*]] = zext i8 [[TMP32]] to i64
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP33]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
 ; PRED-NEXT:    [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; PRED-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
@@ -190,7 +203,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED:       middle.block:
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; PRED-NEXT:    br label [[LOOP:%.*]]
 ; PRED:       loop:
 ; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index e7e63e55802fe1..53ef470e098d0d 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -185,18 +185,18 @@ define void @load_clamped_index_offset_1(ptr %A, ptr %B, i32 %N) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = urem i32 [[TMP10]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD]], <i32 10, i32 10, i32 10, i32 10>
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = urem i32 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[WIDE_LOAD]], <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index ecdc4ed416d47e..10e43dd97200b2 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -77,11 +77,11 @@ define void @different_steps_and_different_access_sizes(ptr %a, ptr %b, i64 %n)
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT:    [[N_SHL_1:%.]] = shl i64 %n, 1
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr %a, i64 [[N_SHL_1]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 %n, 1
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr %a, i64 [[TMP1]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP1]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr %a, [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
@@ -177,21 +177,22 @@ exit:
 define void @nested_loop_outer_iv_addrec_invariant_in_inner1(ptr %a, ptr %b, i64 %n) {
 ; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner1(
 ; CHECK:        entry:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 %n, 2
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr %b, i64 [[TMP0]]
 ; CHECK-NEXT:    br label %outer
 
 ; CHECK:       outer.header:
-; CHECK:         [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2
-; CHECK-NEXT:    [[A_GEP_UPPER:%.*]] = getelementptr nuw i8, ptr %a, i64 [[OUTER_IV_SHL_2]]
-; CHECK-NEXT:    [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4
-; CHECK-NEXT:    [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]]
-; CHECK:         [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
-
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr nuw i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
 ;
@@ -226,22 +227,23 @@ exit:
 ; sink and source swapped.
 define void @nested_loop_outer_iv_addrec_invariant_in_inner2(ptr %a, ptr %b, i64 %n) {
 ; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner2(
-; CHECK:        entry:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT:    br label %outer
-
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N:%.*]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
 ; CHECK:       outer.header:
-; CHECK:         [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2
-; CHECK-NEXT:    [[A_GEP_UPPER:%.*]] = getelementptr nuw i8, ptr %a, i64 [[OUTER_IV_SHL_2]]
-; CHECK-NEXT:    [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4
-; CHECK-NEXT:    [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]]
-; CHECK:         [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr nuw i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
 ;
@@ -280,7 +282,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
 ; CHECK-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
-; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
 ; CHECK:       outer.loop:
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
@@ -288,7 +290,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[SUB]], 16
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
 entry:

>From d99a0536a62def7b191e8672ac131e58c841030c Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 22 Oct 2024 17:34:43 +0100
Subject: [PATCH 2/5] Update tests after rebase

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   7 +-
 llvm/test/CodeGen/AArch64/whilewr.ll          | 734 +++++++++---------
 2 files changed, 365 insertions(+), 376 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bf2f0674b5b65e..97650782c91356 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14178,13 +14178,10 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
     return SDValue();
 
   CondCodeSDNode *Cond = cast<CondCodeSDNode>(Cmp.getOperand(2));
-
   auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
-  if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 ||
-      Cond->get() != ISD::CondCode::SETLT)
+  if (!ComparatorConst || Cond->get() != ISD::CondCode::SETLT)
     return SDValue();
-  unsigned CompValue = std::abs(ComparatorConst->getSExtValue());
-  unsigned EltSize = CompValue + 1;
+  unsigned EltSize = std::abs(ComparatorConst->getSExtValue());
   if (!isPowerOf2_64(EltSize) || EltSize > 8)
     return SDValue();
 
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 9f1ea850792384..2269fd450180af 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -1,17 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
+; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2
+; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOSVE2
 
 define <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    whilewr p0.b, x1, x2
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_8:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    whilewr p0.b, x1, x2
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_8:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
-; CHECK-NOSVE2-NEXT:    cmp x8, #0
+; CHECK-NOSVE2-NEXT:    cmp x8, #1
 ; CHECK-NOSVE2-NEXT:    cset w9, lt
 ; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x8
 ; CHECK-NOSVE2-NEXT:    sbfx x8, x9, #0, #1
@@ -22,7 +22,7 @@ entry:
   %c14 = ptrtoint ptr %c to i64
   %b15 = ptrtoint ptr %b to i64
   %sub.diff = sub i64 %b15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
+  %neg.compare = icmp slt i64 %sub.diff, 1
   %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
@@ -31,15 +31,15 @@ entry:
 }
 
 define <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_commutative:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    whilewr p0.b, x1, x2
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_commutative:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    whilewr p0.b, x1, x2
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_commutative:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
-; CHECK-NOSVE2-NEXT:    cmp x8, #0
+; CHECK-NOSVE2-NEXT:    cmp x8, #1
 ; CHECK-NOSVE2-NEXT:    cset w9, lt
 ; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x8
 ; CHECK-NOSVE2-NEXT:    sbfx x8, x9, #0, #1
@@ -50,7 +50,7 @@ entry:
   %c14 = ptrtoint ptr %c to i64
   %b15 = ptrtoint ptr %b to i64
   %sub.diff = sub i64 %b15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
+  %neg.compare = icmp slt i64 %sub.diff, 1
   %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
@@ -59,15 +59,15 @@ entry:
 }
 
 define <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    whilewr p0.h, x1, x2
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    whilewr p0.h, x1, x2
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_16:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
-; CHECK-NOSVE2-NEXT:    cmn x8, #1
+; CHECK-NOSVE2-NEXT:    cmp x8, #2
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x8, lsr #63
 ; CHECK-NOSVE2-NEXT:    cset w9, lt
 ; CHECK-NOSVE2-NEXT:    sbfx x9, x9, #0, #1
@@ -81,7 +81,7 @@ entry:
   %c15 = ptrtoint ptr %c to i64
   %sub.diff = sub i64 %b14, %c15
   %diff = sdiv i64 %sub.diff, 2
-  %neg.compare = icmp slt i64 %sub.diff, -1
+  %neg.compare = icmp slt i64 %sub.diff, 2
   %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
@@ -90,10 +90,10 @@ entry:
 }
 
 define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    whilewr p0.s, x1, x2
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    whilewr p0.s, x1, x2
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_32:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
@@ -101,7 +101,7 @@ define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    add x9, x8, #3
 ; CHECK-NOSVE2-NEXT:    cmp x8, #0
 ; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT:    cmn x8, #3
+; CHECK-NOSVE2-NEXT:    cmp x8, #4
 ; CHECK-NOSVE2-NEXT:    cset w8, lt
 ; CHECK-NOSVE2-NEXT:    asr x9, x9, #2
 ; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
@@ -114,7 +114,7 @@ entry:
   %c13 = ptrtoint ptr %c to i64
   %sub.diff = sub i64 %b12, %c13
   %diff = sdiv i64 %sub.diff, 4
-  %neg.compare = icmp slt i64 %sub.diff, -3
+  %neg.compare = icmp slt i64 %sub.diff, 4
   %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
@@ -123,10 +123,10 @@ entry:
 }
 
 define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    whilewr p0.d, x1, x2
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    whilewr p0.d, x1, x2
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_64:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
@@ -134,7 +134,7 @@ define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    add x9, x8, #7
 ; CHECK-NOSVE2-NEXT:    cmp x8, #0
 ; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT:    cmn x8, #7
+; CHECK-NOSVE2-NEXT:    cmp x8, #8
 ; CHECK-NOSVE2-NEXT:    cset w8, lt
 ; CHECK-NOSVE2-NEXT:    asr x9, x9, #3
 ; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
@@ -147,7 +147,7 @@ entry:
   %c13 = ptrtoint ptr %c to i64
   %sub.diff = sub i64 %b12, %c13
   %diff = sdiv i64 %sub.diff, 8
-  %neg.compare = icmp slt i64 %sub.diff, -7
+  %neg.compare = icmp slt i64 %sub.diff, 8
   %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
@@ -164,7 +164,7 @@ define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n)
 ; CHECK-NEXT:    add x9, x8, #15
 ; CHECK-NEXT:    cmp x8, #0
 ; CHECK-NEXT:    csel x9, x9, x8, lt
-; CHECK-NEXT:    cmn x8, #15
+; CHECK-NEXT:    cmp x8, #16
 ; CHECK-NEXT:    asr x9, x9, #4
 ; CHECK-NEXT:    cset w8, lt
 ; CHECK-NEXT:    sbfx x8, x8, #0, #1
@@ -175,32 +175,12 @@ define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n)
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    ret
-;
-; CHECK-NOSVE2-LABEL: no_whilewr_128:
-; CHECK-NOSVE2:       // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
-; CHECK-NOSVE2-NEXT:    index z0.d, #0, #1
-; CHECK-NOSVE2-NEXT:    ptrue p0.d
-; CHECK-NOSVE2-NEXT:    add x9, x8, #15
-; CHECK-NOSVE2-NEXT:    cmp x8, #0
-; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT:    cmn x8, #15
-; CHECK-NOSVE2-NEXT:    asr x9, x9, #4
-; CHECK-NOSVE2-NEXT:    cset w8, lt
-; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT:    mov z1.d, x9
-; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x8
-; CHECK-NOSVE2-NEXT:    cmphi p0.d, p0/z, z1.d, z0.d
-; CHECK-NOSVE2-NEXT:    punpklo p1.h, p1.b
-; CHECK-NOSVE2-NEXT:    punpklo p0.h, p0.b
-; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT:    ret
 entry:
   %b12 = ptrtoint ptr %b to i64
   %c13 = ptrtoint ptr %c to i64
   %sub.diff = sub i64 %b12, %c13
   %diff = sdiv i64 %sub.diff, 16
-  %neg.compare = icmp slt i64 %sub.diff, -15
+  %neg.compare = icmp slt i64 %sub.diff, 16
   %.splatinsert = insertelement <vscale x 1 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 1 x i1> %.splatinsert, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff)
@@ -209,29 +189,29 @@ entry:
 }
 
 define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB6_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    whilewr p0.b, x1, x2
-; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    whilelo p1.b, xzr, x9
-; CHECK-NEXT:    cntp x10, p0, p0.b
-; CHECK-NEXT:    and x10, x10, #0xff
-; CHECK-NEXT:  .LBB6_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT:    add z0.b, z1.b, z0.b
-; CHECK-NEXT:    st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.b, x8, x9
-; CHECK-NEXT:    b.mi .LBB6_2
-; CHECK-NEXT:  .LBB6_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_loop_8:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    cmp w3, #1
+; CHECK-SVE2-NEXT:    b.lt .LBB6_3
+; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT:    whilewr p0.b, x1, x2
+; CHECK-SVE2-NEXT:    mov w9, w3
+; CHECK-SVE2-NEXT:    mov x8, xzr
+; CHECK-SVE2-NEXT:    whilelo p1.b, xzr, x9
+; CHECK-SVE2-NEXT:    cntp x10, p0, p0.b
+; CHECK-SVE2-NEXT:    and x10, x10, #0xff
+; CHECK-SVE2-NEXT:  .LBB6_2: // %vector.body
+; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-SVE2-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-SVE2-NEXT:    add z0.b, z1.b, z0.b
+; CHECK-SVE2-NEXT:    st1b { z0.b }, p1, [x2, x8]
+; CHECK-SVE2-NEXT:    add x8, x8, x10
+; CHECK-SVE2-NEXT:    whilelo p1.b, x8, x9
+; CHECK-SVE2-NEXT:    b.mi .LBB6_2
+; CHECK-SVE2-NEXT:  .LBB6_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_8:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
@@ -240,7 +220,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    cmp x9, #0
+; CHECK-NOSVE2-NEXT:    cmp x9, #1
 ; CHECK-NOSVE2-NEXT:    cset w10, lt
 ; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x9
 ; CHECK-NOSVE2-NEXT:    sbfx x9, x10, #0, #1
@@ -271,7 +251,7 @@ for.body.preheader:
   %b15 = ptrtoint ptr %b to i64
   %wide.trip.count = zext nneg i32 %n to i64
   %sub.diff = sub i64 %b15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
+  %neg.compare = icmp slt i64 %sub.diff, 1
   %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
@@ -303,55 +283,58 @@ for.cond.cleanup:
 }
 
 define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB7_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w8, w3
-; CHECK-NEXT:    whilewr p1.h, x1, x2
-; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    whilelo p0.h, xzr, x8
-; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:  .LBB7_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
-; CHECK-NEXT:    add z0.h, z1.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x2, x9, lsl #1]
-; CHECK-NEXT:    inch x9
-; CHECK-NEXT:    whilelo p0.h, x9, x8
-; CHECK-NEXT:    b.mi .LBB7_2
-; CHECK-NEXT:  .LBB7_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_loop_16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    cmp w3, #1
+; CHECK-SVE2-NEXT:    b.lt .LBB7_3
+; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT:    whilewr p0.h, x1, x2
+; CHECK-SVE2-NEXT:    mov w9, w3
+; CHECK-SVE2-NEXT:    mov x8, xzr
+; CHECK-SVE2-NEXT:    whilelo p1.h, xzr, x9
+; CHECK-SVE2-NEXT:    cntp x10, p0, p0.h
+; CHECK-SVE2-NEXT:    and x10, x10, #0xff
+; CHECK-SVE2-NEXT:  .LBB7_2: // %vector.body
+; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
+; CHECK-SVE2-NEXT:    ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
+; CHECK-SVE2-NEXT:    add z0.h, z1.h, z0.h
+; CHECK-SVE2-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
+; CHECK-SVE2-NEXT:    add x8, x8, x10
+; CHECK-SVE2-NEXT:    whilelo p1.h, x8, x9
+; CHECK-SVE2-NEXT:    b.mi .LBB7_2
+; CHECK-SVE2-NEXT:  .LBB7_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_16:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
 ; CHECK-NOSVE2-NEXT:    b.lt .LBB7_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
+; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    whilelo p0.h, xzr, x9
-; CHECK-NOSVE2-NEXT:    cmn x10, #1
-; CHECK-NOSVE2-NEXT:    add x10, x10, x10, lsr #63
-; CHECK-NOSVE2-NEXT:    cset w11, lt
-; CHECK-NOSVE2-NEXT:    sbfx x11, x11, #0, #1
-; CHECK-NOSVE2-NEXT:    asr x10, x10, #1
-; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x11
-; CHECK-NOSVE2-NEXT:    whilelo p2.h, xzr, x10
-; CHECK-NOSVE2-NEXT:    cnth x10
-; CHECK-NOSVE2-NEXT:    mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT:    and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT:    cmp x9, #2
+; CHECK-NOSVE2-NEXT:    add x9, x9, x9, lsr #63
+; CHECK-NOSVE2-NEXT:    cset w10, lt
+; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
+; CHECK-NOSVE2-NEXT:    asr x9, x9, #1
+; CHECK-NOSVE2-NEXT:    whilelo p0.h, xzr, x10
+; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x9
+; CHECK-NOSVE2-NEXT:    mov w9, w3
+; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x9
+; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.h
+; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
 ; CHECK-NOSVE2-NEXT:  .LBB7_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; CHECK-NOSVE2-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
+; CHECK-NOSVE2-NEXT:    ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
 ; CHECK-NOSVE2-NEXT:    add z0.h, z1.h, z0.h
-; CHECK-NOSVE2-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
+; CHECK-NOSVE2-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p0.h, x8, x9
+; CHECK-NOSVE2-NEXT:    whilelo p1.h, x8, x9
 ; CHECK-NOSVE2-NEXT:    b.mi .LBB7_2
 ; CHECK-NOSVE2-NEXT:  .LBB7_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
@@ -360,93 +343,97 @@ entry:
   br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
 
 for.body.preheader:
-  %b14 = ptrtoint ptr %b to i64
-  %c15 = ptrtoint ptr %c to i64
+  %c16 = ptrtoint ptr %c to i64
+  %b17 = ptrtoint ptr %b to i64
   %wide.trip.count = zext nneg i32 %n to i64
-  %0 = tail call i64 @llvm.vscale.i64()
-  %1 = shl nuw nsw i64 %0, 3
-  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
-  %sub.diff = sub i64 %b14, %c15
+  %sub.diff = sub i64 %b17, %c16
   %diff = sdiv i64 %sub.diff, 2
-  %neg.compare = icmp slt i64 %sub.diff, -1
+  %neg.compare = icmp slt i64 %sub.diff, 2
   %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
-  %2 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.entry
+  %0 = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
+  %1 = zext <vscale x 8 x i1> %0 to <vscale x 8 x i8>
+  %2 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %1)
+  %3 = zext i8 %2 to i64
   br label %vector.body
 
 vector.body:
   %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 8 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = getelementptr inbounds i16, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
-  %4 = getelementptr inbounds i16, ptr %b, i64 %index
-  %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
-  %5 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
-  %6 = getelementptr inbounds i16, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %5, ptr %6, i32 2, <vscale x 8 x i1> %active.lane.mask)
-  %index.next = add i64 %index, %1
+  %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %4 = and <vscale x 8 x i1> %active.lane.mask, %0
+  %5 = getelementptr inbounds i16, ptr %a, i64 %index
+  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
+  %6 = getelementptr inbounds i16, ptr %b, i64 %index
+  %wide.masked.load18 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
+  %7 = add <vscale x 8 x i16> %wide.masked.load18, %wide.masked.load
+  %8 = getelementptr inbounds i16, ptr %c, i64 %index
+  tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %7, ptr %8, i32 2, <vscale x 8 x i1> %4)
+  %index.next = add i64 %index, %3
   %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %7 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
-  br i1 %7, label %vector.body, label %for.cond.cleanup
+  %9 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
 
 for.cond.cleanup:
   ret void
 }
 
 define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB8_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w8, w3
-; CHECK-NEXT:    whilewr p1.s, x1, x2
-; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    whilelo p0.s, xzr, x8
-; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:  .LBB8_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
-; CHECK-NEXT:    add z0.s, z1.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x2, x9, lsl #2]
-; CHECK-NEXT:    incw x9
-; CHECK-NEXT:    whilelo p0.s, x9, x8
-; CHECK-NEXT:    b.mi .LBB8_2
-; CHECK-NEXT:  .LBB8_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_loop_32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    cmp w3, #1
+; CHECK-SVE2-NEXT:    b.lt .LBB8_3
+; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT:    whilewr p0.s, x1, x2
+; CHECK-SVE2-NEXT:    mov w9, w3
+; CHECK-SVE2-NEXT:    mov x8, xzr
+; CHECK-SVE2-NEXT:    whilelo p1.s, xzr, x9
+; CHECK-SVE2-NEXT:    cntp x10, p0, p0.s
+; CHECK-SVE2-NEXT:    and x10, x10, #0xff
+; CHECK-SVE2-NEXT:  .LBB8_2: // %vector.body
+; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
+; CHECK-SVE2-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-SVE2-NEXT:    add z0.s, z1.s, z0.s
+; CHECK-SVE2-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
+; CHECK-SVE2-NEXT:    add x8, x8, x10
+; CHECK-SVE2-NEXT:    whilelo p1.s, x8, x9
+; CHECK-SVE2-NEXT:    b.mi .LBB8_2
+; CHECK-SVE2-NEXT:  .LBB8_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_32:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
 ; CHECK-NOSVE2-NEXT:    b.lt .LBB8_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
+; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    whilelo p0.s, xzr, x9
-; CHECK-NOSVE2-NEXT:    add x11, x10, #3
-; CHECK-NOSVE2-NEXT:    cmp x10, #0
-; CHECK-NOSVE2-NEXT:    csel x11, x11, x10, lt
-; CHECK-NOSVE2-NEXT:    cmn x10, #3
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    asr x11, x11, #2
-; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p2.s, xzr, x11
+; CHECK-NOSVE2-NEXT:    add x10, x9, #3
+; CHECK-NOSVE2-NEXT:    cmp x9, #0
+; CHECK-NOSVE2-NEXT:    csel x10, x10, x9, lt
+; CHECK-NOSVE2-NEXT:    cmp x9, #4
+; CHECK-NOSVE2-NEXT:    cset w9, lt
+; CHECK-NOSVE2-NEXT:    asr x10, x10, #2
+; CHECK-NOSVE2-NEXT:    sbfx x9, x9, #0, #1
 ; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x10
-; CHECK-NOSVE2-NEXT:    cntw x10
-; CHECK-NOSVE2-NEXT:    mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT:    and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT:    whilelo p0.s, xzr, x9
+; CHECK-NOSVE2-NEXT:    mov w9, w3
+; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x9
+; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.s
+; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
 ; CHECK-NOSVE2-NEXT:  .LBB8_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; CHECK-NOSVE2-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
+; CHECK-NOSVE2-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
 ; CHECK-NOSVE2-NEXT:    add z0.s, z1.s, z0.s
-; CHECK-NOSVE2-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
+; CHECK-NOSVE2-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p0.s, x8, x9
+; CHECK-NOSVE2-NEXT:    whilelo p1.s, x8, x9
 ; CHECK-NOSVE2-NEXT:    b.mi .LBB8_2
 ; CHECK-NOSVE2-NEXT:  .LBB8_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
@@ -455,93 +442,97 @@ entry:
   br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
 
 for.body.preheader:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
+  %c14 = ptrtoint ptr %c to i64
+  %b15 = ptrtoint ptr %b to i64
   %wide.trip.count = zext nneg i32 %n to i64
-  %0 = tail call i64 @llvm.vscale.i64()
-  %1 = shl nuw nsw i64 %0, 2
-  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
-  %sub.diff = sub i64 %b12, %c13
+  %sub.diff = sub i64 %b15, %c14
   %diff = sdiv i64 %sub.diff, 4
-  %neg.compare = icmp slt i64 %sub.diff, -3
+  %neg.compare = icmp slt i64 %sub.diff, 4
   %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
-  %2 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.entry
+  %0 = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+  %1 = zext <vscale x 4 x i1> %0 to <vscale x 4 x i8>
+  %2 = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %1)
+  %3 = zext i8 %2 to i64
   br label %vector.body
 
 vector.body:
   %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 4 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = getelementptr inbounds i32, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
-  %4 = getelementptr inbounds i32, ptr %b, i64 %index
-  %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
-  %5 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
-  %6 = getelementptr inbounds i32, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
-  %index.next = add i64 %index, %1
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %4 = and <vscale x 4 x i1> %active.lane.mask, %0
+  %5 = getelementptr inbounds i32, ptr %a, i64 %index
+  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
+  %6 = getelementptr inbounds i32, ptr %b, i64 %index
+  %wide.masked.load16 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %6, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
+  %7 = add <vscale x 4 x i32> %wide.masked.load16, %wide.masked.load
+  %8 = getelementptr inbounds i32, ptr %c, i64 %index
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %7, ptr %8, i32 4, <vscale x 4 x i1> %4)
+  %index.next = add i64 %index, %3
   %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
-  br i1 %7, label %vector.body, label %for.cond.cleanup
+  %9 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
 
 for.cond.cleanup:
   ret void
 }
 
 define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB9_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w8, w3
-; CHECK-NEXT:    whilewr p1.d, x1, x2
-; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    whilelo p0.d, xzr, x8
-; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:  .LBB9_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [x2, x9, lsl #3]
-; CHECK-NEXT:    incd x9
-; CHECK-NEXT:    whilelo p0.d, x9, x8
-; CHECK-NEXT:    b.mi .LBB9_2
-; CHECK-NEXT:  .LBB9_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_loop_64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    cmp w3, #1
+; CHECK-SVE2-NEXT:    b.lt .LBB9_3
+; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT:    whilewr p0.d, x1, x2
+; CHECK-SVE2-NEXT:    mov w9, w3
+; CHECK-SVE2-NEXT:    mov x8, xzr
+; CHECK-SVE2-NEXT:    whilelo p1.d, xzr, x9
+; CHECK-SVE2-NEXT:    cntp x10, p0, p0.d
+; CHECK-SVE2-NEXT:    and x10, x10, #0xff
+; CHECK-SVE2-NEXT:  .LBB9_2: // %vector.body
+; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
+; CHECK-SVE2-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; CHECK-SVE2-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-SVE2-NEXT:    st1d { z0.d }, p1, [x2, x8, lsl #3]
+; CHECK-SVE2-NEXT:    add x8, x8, x10
+; CHECK-SVE2-NEXT:    whilelo p1.d, x8, x9
+; CHECK-SVE2-NEXT:    b.mi .LBB9_2
+; CHECK-SVE2-NEXT:  .LBB9_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_64:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
 ; CHECK-NOSVE2-NEXT:    b.lt .LBB9_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT:    mov w9, w3
-; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
+; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    whilelo p0.d, xzr, x9
-; CHECK-NOSVE2-NEXT:    add x11, x10, #7
-; CHECK-NOSVE2-NEXT:    cmp x10, #0
-; CHECK-NOSVE2-NEXT:    csel x11, x11, x10, lt
-; CHECK-NOSVE2-NEXT:    cmn x10, #7
-; CHECK-NOSVE2-NEXT:    cset w10, lt
-; CHECK-NOSVE2-NEXT:    asr x11, x11, #3
-; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT:    whilelo p2.d, xzr, x11
+; CHECK-NOSVE2-NEXT:    add x10, x9, #7
+; CHECK-NOSVE2-NEXT:    cmp x9, #0
+; CHECK-NOSVE2-NEXT:    csel x10, x10, x9, lt
+; CHECK-NOSVE2-NEXT:    cmp x9, #8
+; CHECK-NOSVE2-NEXT:    cset w9, lt
+; CHECK-NOSVE2-NEXT:    asr x10, x10, #3
+; CHECK-NOSVE2-NEXT:    sbfx x9, x9, #0, #1
 ; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x10
-; CHECK-NOSVE2-NEXT:    cntd x10
-; CHECK-NOSVE2-NEXT:    mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT:    and p0.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT:    whilelo p0.d, xzr, x9
+; CHECK-NOSVE2-NEXT:    mov w9, w3
+; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x9
+; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.d
+; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
 ; CHECK-NOSVE2-NEXT:  .LBB9_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; CHECK-NOSVE2-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
+; CHECK-NOSVE2-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
 ; CHECK-NOSVE2-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NOSVE2-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
+; CHECK-NOSVE2-NEXT:    st1d { z0.d }, p1, [x2, x8, lsl #3]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
-; CHECK-NOSVE2-NEXT:    whilelo p0.d, x8, x9
+; CHECK-NOSVE2-NEXT:    whilelo p1.d, x8, x9
 ; CHECK-NOSVE2-NEXT:    b.mi .LBB9_2
 ; CHECK-NOSVE2-NEXT:  .LBB9_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
@@ -550,67 +541,68 @@ entry:
   br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
 
 for.body.preheader:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
+  %c14 = ptrtoint ptr %c to i64
+  %b15 = ptrtoint ptr %b to i64
   %wide.trip.count = zext nneg i32 %n to i64
-  %0 = tail call i64 @llvm.vscale.i64()
-  %1 = shl nuw nsw i64 %0, 1
-  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
-  %sub.diff = sub i64 %b12, %c13
+  %sub.diff = sub i64 %b15, %c14
   %diff = sdiv i64 %sub.diff, 8
-  %neg.compare = icmp slt i64 %sub.diff, -7
+  %neg.compare = icmp slt i64 %sub.diff, 8
   %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
-  %2 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.entry
+  %0 = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
+  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
+  %1 = zext <vscale x 2 x i1> %0 to <vscale x 2 x i8>
+  %2 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %1)
+  %3 = zext i8 %2 to i64
   br label %vector.body
 
 vector.body:
   %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 2 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = getelementptr inbounds i64, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
-  %4 = getelementptr inbounds i64, ptr %b, i64 %index
-  %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
-  %5 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
-  %6 = getelementptr inbounds i64, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %5, ptr %6, i32 8, <vscale x 2 x i1> %active.lane.mask)
-  %index.next = add i64 %index, %1
+  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %4 = and <vscale x 2 x i1> %active.lane.mask, %0
+  %5 = getelementptr inbounds i64, ptr %a, i64 %index
+  %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
+  %6 = getelementptr inbounds i64, ptr %b, i64 %index
+  %wide.masked.load16 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %6, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
+  %7 = add <vscale x 2 x i64> %wide.masked.load16, %wide.masked.load
+  %8 = getelementptr inbounds i64, ptr %c, i64 %index
+  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %7, ptr %8, i32 8, <vscale x 2 x i1> %4)
+  %index.next = add i64 %index, %3
   %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %7 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
-  br i1 %7, label %vector.body, label %for.cond.cleanup
+  %9 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
 
 for.cond.cleanup:
   ret void
 }
 
 define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB10_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    whilewr p0.b, x0, x2
-; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    whilewr p1.b, x1, x2
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT:    whilelo p1.b, xzr, x9
-; CHECK-NEXT:    cntp x10, p0, p0.b
-; CHECK-NEXT:    and x10, x10, #0xff
-; CHECK-NEXT:  .LBB10_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT:    add z0.b, z1.b, z0.b
-; CHECK-NEXT:    st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.b, x8, x9
-; CHECK-NEXT:    b.mi .LBB10_2
-; CHECK-NEXT:  .LBB10_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_loop_multiple_8:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    cmp w3, #1
+; CHECK-SVE2-NEXT:    b.lt .LBB10_3
+; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT:    whilewr p0.b, x0, x2
+; CHECK-SVE2-NEXT:    mov w9, w3
+; CHECK-SVE2-NEXT:    mov x8, xzr
+; CHECK-SVE2-NEXT:    whilewr p1.b, x1, x2
+; CHECK-SVE2-NEXT:    and p0.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT:    whilelo p1.b, xzr, x9
+; CHECK-SVE2-NEXT:    cntp x10, p0, p0.b
+; CHECK-SVE2-NEXT:    and x10, x10, #0xff
+; CHECK-SVE2-NEXT:  .LBB10_2: // %vector.body
+; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-SVE2-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-SVE2-NEXT:    add z0.b, z1.b, z0.b
+; CHECK-SVE2-NEXT:    st1b { z0.b }, p1, [x2, x8]
+; CHECK-SVE2-NEXT:    add x8, x8, x10
+; CHECK-SVE2-NEXT:    whilelo p1.b, x8, x9
+; CHECK-SVE2-NEXT:    b.mi .LBB10_2
+; CHECK-SVE2-NEXT:  .LBB10_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
@@ -619,13 +611,13 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    cmp x9, #0
+; CHECK-NOSVE2-NEXT:    cmp x9, #1
 ; CHECK-NOSVE2-NEXT:    cset w10, lt
 ; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x9
 ; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
 ; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
 ; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x10
-; CHECK-NOSVE2-NEXT:    cmp x9, #0
+; CHECK-NOSVE2-NEXT:    cmp x9, #1
 ; CHECK-NOSVE2-NEXT:    cset w10, lt
 ; CHECK-NOSVE2-NEXT:    whilelo p3.b, xzr, x9
 ; CHECK-NOSVE2-NEXT:    mov w9, w3
@@ -659,13 +651,13 @@ for.body.preheader:
   %b16 = ptrtoint ptr %b to i64
   %wide.trip.count = zext nneg i32 %n to i64
   %sub.diff = sub i64 %a15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
+  %neg.compare = icmp slt i64 %sub.diff, 1
   %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
   %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
   %sub.diff18 = sub i64 %b16, %c14
-  %neg.compare20 = icmp slt i64 %sub.diff18, 0
+  %neg.compare20 = icmp slt i64 %sub.diff18, 1
   %.splatinsert21 = insertelement <vscale x 16 x i1> poison, i1 %neg.compare20, i64 0
   %.splat22 = shufflevector <vscale x 16 x i1> %.splatinsert21, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
   %ptr.diff.lane.mask23 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18)
@@ -698,31 +690,31 @@ for.cond.cleanup:
 }
 
 define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB11_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    whilewr p0.h, x0, x2
-; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    whilewr p1.h, x1, x2
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT:    whilelo p1.h, xzr, x9
-; CHECK-NEXT:    cntp x10, p0, p0.h
-; CHECK-NEXT:    and x10, x10, #0xff
-; CHECK-NEXT:  .LBB11_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
-; CHECK-NEXT:    ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
-; CHECK-NEXT:    add z0.h, z1.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.h, x8, x9
-; CHECK-NEXT:    b.mi .LBB11_2
-; CHECK-NEXT:  .LBB11_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_loop_multiple_16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    cmp w3, #1
+; CHECK-SVE2-NEXT:    b.lt .LBB11_3
+; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT:    whilewr p0.h, x0, x2
+; CHECK-SVE2-NEXT:    mov w9, w3
+; CHECK-SVE2-NEXT:    mov x8, xzr
+; CHECK-SVE2-NEXT:    whilewr p1.h, x1, x2
+; CHECK-SVE2-NEXT:    and p0.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT:    whilelo p1.h, xzr, x9
+; CHECK-SVE2-NEXT:    cntp x10, p0, p0.h
+; CHECK-SVE2-NEXT:    and x10, x10, #0xff
+; CHECK-SVE2-NEXT:  .LBB11_2: // %vector.body
+; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
+; CHECK-SVE2-NEXT:    ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
+; CHECK-SVE2-NEXT:    add z0.h, z1.h, z0.h
+; CHECK-SVE2-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
+; CHECK-SVE2-NEXT:    add x8, x8, x10
+; CHECK-SVE2-NEXT:    whilelo p1.h, x8, x9
+; CHECK-SVE2-NEXT:    b.mi .LBB11_2
+; CHECK-SVE2-NEXT:  .LBB11_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
@@ -731,7 +723,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
-; CHECK-NOSVE2-NEXT:    cmn x9, #1
+; CHECK-NOSVE2-NEXT:    cmp x9, #2
 ; CHECK-NOSVE2-NEXT:    add x9, x9, x9, lsr #63
 ; CHECK-NOSVE2-NEXT:    cset w10, lt
 ; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
@@ -740,7 +732,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
 ; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x9
 ; CHECK-NOSVE2-NEXT:    add x9, x10, x10, lsr #63
-; CHECK-NOSVE2-NEXT:    cmn x10, #1
+; CHECK-NOSVE2-NEXT:    cmp x10, #2
 ; CHECK-NOSVE2-NEXT:    cset w10, lt
 ; CHECK-NOSVE2-NEXT:    asr x9, x9, #1
 ; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
@@ -776,14 +768,14 @@ for.body.preheader:
   %wide.trip.count = zext nneg i32 %n to i64
   %sub.diff = sub i64 %a15, %c14
   %diff = sdiv i64 %sub.diff, 2
-  %neg.compare = icmp slt i64 %sub.diff, -1
+  %neg.compare = icmp slt i64 %sub.diff, 2
   %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
   %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
   %sub.diff18 = sub i64 %b16, %c14
   %diff19 = sdiv i64 %sub.diff18, 2
-  %neg.compare20 = icmp slt i64 %sub.diff18, -1
+  %neg.compare20 = icmp slt i64 %sub.diff18, 2
   %.splatinsert21 = insertelement <vscale x 8 x i1> poison, i1 %neg.compare20, i64 0
   %.splat22 = shufflevector <vscale x 8 x i1> %.splatinsert21, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
   %ptr.diff.lane.mask23 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19)
@@ -816,31 +808,31 @@ for.cond.cleanup:
 }
 
 define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB12_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    whilewr p0.s, x0, x2
-; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    whilewr p1.s, x1, x2
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT:    whilelo p1.s, xzr, x9
-; CHECK-NEXT:    cntp x10, p0, p0.s
-; CHECK-NEXT:    and x10, x10, #0xff
-; CHECK-NEXT:  .LBB12_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
-; CHECK-NEXT:    add z0.s, z1.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.s, x8, x9
-; CHECK-NEXT:    b.mi .LBB12_2
-; CHECK-NEXT:  .LBB12_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_loop_multiple_32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    cmp w3, #1
+; CHECK-SVE2-NEXT:    b.lt .LBB12_3
+; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT:    whilewr p0.s, x0, x2
+; CHECK-SVE2-NEXT:    mov w9, w3
+; CHECK-SVE2-NEXT:    mov x8, xzr
+; CHECK-SVE2-NEXT:    whilewr p1.s, x1, x2
+; CHECK-SVE2-NEXT:    and p0.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT:    whilelo p1.s, xzr, x9
+; CHECK-SVE2-NEXT:    cntp x10, p0, p0.s
+; CHECK-SVE2-NEXT:    and x10, x10, #0xff
+; CHECK-SVE2-NEXT:  .LBB12_2: // %vector.body
+; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
+; CHECK-SVE2-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-SVE2-NEXT:    add z0.s, z1.s, z0.s
+; CHECK-SVE2-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
+; CHECK-SVE2-NEXT:    add x8, x8, x10
+; CHECK-SVE2-NEXT:    whilelo p1.s, x8, x9
+; CHECK-SVE2-NEXT:    b.mi .LBB12_2
+; CHECK-SVE2-NEXT:  .LBB12_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
@@ -852,7 +844,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    add x10, x9, #3
 ; CHECK-NOSVE2-NEXT:    cmp x9, #0
 ; CHECK-NOSVE2-NEXT:    csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT:    cmn x9, #3
+; CHECK-NOSVE2-NEXT:    cmp x9, #4
 ; CHECK-NOSVE2-NEXT:    asr x9, x10, #2
 ; CHECK-NOSVE2-NEXT:    cset w10, lt
 ; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
@@ -862,7 +854,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    add x10, x9, #3
 ; CHECK-NOSVE2-NEXT:    cmp x9, #0
 ; CHECK-NOSVE2-NEXT:    csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT:    cmn x9, #3
+; CHECK-NOSVE2-NEXT:    cmp x9, #4
 ; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NOSVE2-NEXT:    cset w9, lt
 ; CHECK-NOSVE2-NEXT:    asr x10, x10, #2
@@ -898,14 +890,14 @@ for.body.preheader:
   %wide.trip.count = zext nneg i32 %n to i64
   %sub.diff = sub i64 %a13, %c12
   %diff = sdiv i64 %sub.diff, 4
-  %neg.compare = icmp slt i64 %sub.diff, -3
+  %neg.compare = icmp slt i64 %sub.diff, 4
   %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
   %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
   %sub.diff16 = sub i64 %b14, %c12
   %diff17 = sdiv i64 %sub.diff16, 4
-  %neg.compare18 = icmp slt i64 %sub.diff16, -3
+  %neg.compare18 = icmp slt i64 %sub.diff16, 4
   %.splatinsert19 = insertelement <vscale x 4 x i1> poison, i1 %neg.compare18, i64 0
   %.splat20 = shufflevector <vscale x 4 x i1> %.splatinsert19, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
   %ptr.diff.lane.mask21 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17)
@@ -938,31 +930,31 @@ for.cond.cleanup:
 }
 
 define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w3, #1
-; CHECK-NEXT:    b.lt .LBB13_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    whilewr p0.d, x0, x2
-; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    whilewr p1.d, x1, x2
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT:    whilelo p1.d, xzr, x9
-; CHECK-NEXT:    cntp x10, p0, p0.d
-; CHECK-NEXT:    and x10, x10, #0xff
-; CHECK-NEXT:  .LBB13_2: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    st1d { z0.d }, p1, [x2, x8, lsl #3]
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.d, x8, x9
-; CHECK-NEXT:    b.mi .LBB13_2
-; CHECK-NEXT:  .LBB13_3: // %for.cond.cleanup
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilewr_loop_multiple_64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    cmp w3, #1
+; CHECK-SVE2-NEXT:    b.lt .LBB13_3
+; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT:    whilewr p0.d, x0, x2
+; CHECK-SVE2-NEXT:    mov w9, w3
+; CHECK-SVE2-NEXT:    mov x8, xzr
+; CHECK-SVE2-NEXT:    whilewr p1.d, x1, x2
+; CHECK-SVE2-NEXT:    and p0.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT:    whilelo p1.d, xzr, x9
+; CHECK-SVE2-NEXT:    cntp x10, p0, p0.d
+; CHECK-SVE2-NEXT:    and x10, x10, #0xff
+; CHECK-SVE2-NEXT:  .LBB13_2: // %vector.body
+; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
+; CHECK-SVE2-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; CHECK-SVE2-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-SVE2-NEXT:    st1d { z0.d }, p1, [x2, x8, lsl #3]
+; CHECK-SVE2-NEXT:    add x8, x8, x10
+; CHECK-SVE2-NEXT:    whilelo p1.d, x8, x9
+; CHECK-SVE2-NEXT:    b.mi .LBB13_2
+; CHECK-SVE2-NEXT:  .LBB13_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
@@ -974,7 +966,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    add x10, x9, #7
 ; CHECK-NOSVE2-NEXT:    cmp x9, #0
 ; CHECK-NOSVE2-NEXT:    csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT:    cmn x9, #7
+; CHECK-NOSVE2-NEXT:    cmp x9, #8
 ; CHECK-NOSVE2-NEXT:    asr x9, x10, #3
 ; CHECK-NOSVE2-NEXT:    cset w10, lt
 ; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
@@ -984,7 +976,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    add x10, x9, #7
 ; CHECK-NOSVE2-NEXT:    cmp x9, #0
 ; CHECK-NOSVE2-NEXT:    csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT:    cmn x9, #7
+; CHECK-NOSVE2-NEXT:    cmp x9, #8
 ; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NOSVE2-NEXT:    cset w9, lt
 ; CHECK-NOSVE2-NEXT:    asr x10, x10, #3
@@ -1020,14 +1012,14 @@ for.body.preheader:
   %wide.trip.count = zext nneg i32 %n to i64
   %sub.diff = sub i64 %a13, %c12
   %diff = sdiv i64 %sub.diff, 8
-  %neg.compare = icmp slt i64 %sub.diff, -7
+  %neg.compare = icmp slt i64 %sub.diff, 8
   %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
   %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
   %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
   %sub.diff16 = sub i64 %b14, %c12
   %diff17 = sdiv i64 %sub.diff16, 8
-  %neg.compare18 = icmp slt i64 %sub.diff16, -7
+  %neg.compare18 = icmp slt i64 %sub.diff16, 8
   %.splatinsert19 = insertelement <vscale x 2 x i1> poison, i1 %neg.compare18, i64 0
   %.splat20 = shufflevector <vscale x 2 x i1> %.splatinsert19, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
   %ptr.diff.lane.mask21 = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17)

>From a23d8caa7f511e38dbb50b224a77695c4446f55d Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 22 Oct 2024 17:36:20 +0100
Subject: [PATCH 3/5] Don't lower a read-after-write hazard

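For a read-after-write hazard the sign of the pointer difference alone is not enough to rule out overlap, so PointerDiffInfo and VPAliasLaneMaskRecipe now carry a WriteAfterRead flag and, when it is false, the recipe takes the absolute value of the difference before forming the lane mask. Such checks are not matched to whilewr, as the new whilerw_8 test below shows. Roughly, the emitted IR looks like the following sketch (value names are illustrative):

  %sub.diff = sub i64 %c, %b
  %abs.diff = call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
  %neg.compare = icmp slt i64 %abs.diff, 1
  %ptr.diff.lane.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %abs.diff)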
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |   5 +-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |   5 +-
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |   3 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  10 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   6 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   2 +-
 llvm/test/CodeGen/AArch64/whilewr.ll          | 230 +++++++++++++-----
 7 files changed, 189 insertions(+), 72 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index a35bc7402d1a89..66796c9a0db90f 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -435,11 +435,12 @@ struct PointerDiffInfo {
   const SCEV *SinkStart;
   unsigned AccessSize;
   bool NeedsFreeze;
+  bool WriteAfterRead;
 
   PointerDiffInfo(const SCEV *SrcStart, const SCEV *SinkStart,
-                  unsigned AccessSize, bool NeedsFreeze)
+                  unsigned AccessSize, bool NeedsFreeze, bool WriteAfterRead)
       : SrcStart(SrcStart), SinkStart(SinkStart), AccessSize(AccessSize),
-        NeedsFreeze(NeedsFreeze) {}
+        NeedsFreeze(NeedsFreeze), WriteAfterRead(WriteAfterRead) {}
 };
 
 /// Holds information about the memory runtime legality checks to verify
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index d35bf6818d4379..0906d2e3487d6e 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -367,11 +367,14 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
     }
   }
 
+  bool WriteAfterRead = isa<LoadInst>(SrcInsts[0]);
+
   LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
                     << "SrcStart: " << *SrcStartInt << '\n'
                     << "SinkStartInt: " << *SinkStartInt << '\n');
   DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
-                          Src->NeedsFreeze || Sink->NeedsFreeze);
+                          Src->NeedsFreeze || Sink->NeedsFreeze,
+                          WriteAfterRead);
   return true;
 }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 70047273c3b9af..9fcbadec084d40 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2004,7 +2004,8 @@ Value *llvm::addDiffRuntimeChecks(
   // Map to keep track of created compares, The key is the pair of operands for
   // the compare, to allow detecting and re-using redundant compares.
   DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
-  for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze] : Checks) {
+  for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze,
+                    WriteAfterRead] : Checks) {
     Type *Ty = SinkStart->getType();
     // Compute VF * IC * AccessSize.
     auto *VFTimesUFTimesSize =
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4fb47714e0d19c..b956b03805b54f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3007,15 +3007,16 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
 class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
 
 public:
-  VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink, unsigned ElementSize)
+  VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink, unsigned ElementSize,
+                        bool WriteAfterRead)
       : VPSingleDefRecipe(VPDef::VPAliasLaneMaskSC, {Src, Sink}),
-        ElementSize(ElementSize) {}
+        ElementSize(ElementSize), WriteAfterRead(WriteAfterRead) {}
 
   ~VPAliasLaneMaskRecipe() override = default;
 
   VPAliasLaneMaskRecipe *clone() override {
     return new VPAliasLaneMaskRecipe(getSourceValue(), getSinkValue(),
-                                     ElementSize);
+                                     ElementSize, WriteAfterRead);
   }
 
   VP_CLASSOF_IMPL(VPDef::VPAliasLaneMaskSC);
@@ -3031,8 +3032,11 @@ class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
   /// Get the VPValue* for the pointer being stored to
   VPValue *getSinkValue() const { return getOperand(1); }
 
+  bool isWriteAfterRead() const { return WriteAfterRead; }
+
 private:
   unsigned ElementSize;
+  bool WriteAfterRead;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 188dc56388dd9c..9939602bfcf618 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3170,6 +3170,10 @@ void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
 
   Value *Diff = Builder.CreateSub(SourceValue, SinkValue, "sub.diff");
   auto *Type = Diff->getType();
+  if (!WriteAfterRead)
+    Diff = Builder.CreateIntrinsic(
+        Intrinsic::abs, {Type},
+        {Diff, ConstantInt::getFalse(Builder.getInt1Ty())});
   Value *MemEltSize = ConstantInt::get(Type, ElementSize);
   Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
   // If the difference is negative then some elements may alias
@@ -3194,6 +3198,8 @@ void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
   getSourceValue()->printAsOperand(O, SlotTracker);
   O << ", ";
   getSinkValue()->printAsOperand(O, SlotTracker);
+  O << " (" << (WriteAfterRead ? "write-after-read" : "read-after-write")
+    << ")";
 }
 #endif
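
For reference, here is a standalone scalar model of the read-after-write mask as it appears in the whilerw test IR further down (this is an illustration, not the VPlan codegen; the write-after-read path differs only in skipping the abs). Lane I stays enabled when I is below the element distance between the pointers, or when the distance is smaller than one element, in which case both accesses hit the same element and every lane is kept.

#include <cstdint>
#include <vector>

std::vector<bool> rawAliasLaneMask(int64_t SrcAddr, int64_t SinkAddr,
                                   int64_t EltSize, unsigned VF) {
  int64_t Diff = SrcAddr - SinkAddr;
  if (Diff < 0)
    Diff = -Diff;                      // the abs added above for read-after-write
  int64_t SafeLanes = Diff / EltSize;  // element distance between the accesses
  bool AllSafe = Diff < EltSize;       // mirrors the splatted "neg.compare"
  std::vector<bool> Mask(VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = AllSafe || static_cast<int64_t>(I) < SafeLanes;
  return Mask;
}
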
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f1b01e32c62470..a97f7292f2c644 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1237,7 +1237,7 @@ static VPValue *addVPLaneMaskPhiAndUpdateExitBranch(
     VPValue *Src =
         vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SrcStart, *PSE.getSE());
     VPAliasLaneMaskRecipe *M =
-        new VPAliasLaneMaskRecipe(Src, Sink, C.AccessSize);
+        new VPAliasLaneMaskRecipe(Src, Sink, C.AccessSize, C.WriteAfterRead);
     VecPreheader->appendRecipe(M);
     if (AliasMask)
       AliasMask = Builder.createAnd(AliasMask, M);
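
When there are several pointer pairs to check, each recipe produces its own lane mask and the masks are ANDed together, as in the createAnd chain above. A standalone sketch of that combination (illustrative names, not the VPlan API):

#include <bitset>
#include <cstddef>
#include <vector>

// A lane is usable only if every per-pair alias mask keeps it.
template <std::size_t VF>
std::bitset<VF> combineAliasMasks(const std::vector<std::bitset<VF>> &PerPair) {
  std::bitset<VF> Combined;
  Combined.set();                      // start from "all lanes safe"
  for (const auto &M : PerPair)
    Combined &= M;
  return Combined;
}
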
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 2269fd450180af..58f786e3885fa0 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -30,6 +30,31 @@ entry:
   ret <vscale x 16 x i1> %active.lane.mask.alias
 }
 
+define <vscale x 16 x i1> @whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilerw_8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    subs x8, x2, x1
+; CHECK-NEXT:    cneg x8, x8, mi
+; CHECK-NEXT:    cmp x8, #1
+; CHECK-NEXT:    cset w9, lt
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
+; CHECK-NEXT:    sbfx x8, x9, #0, #1
+; CHECK-NEXT:    whilelo p1.b, xzr, x8
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT:    ret
+entry:
+  %b24 = ptrtoint ptr %b to i64
+  %c25 = ptrtoint ptr %c to i64
+  %sub.diff = sub i64 %c25, %b24
+  %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+  %neg.compare = icmp slt i64 %0, 1
+  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %0)
+  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
+  ret <vscale x 16 x i1> %active.lane.mask.alias
+}
+
 define <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_commutative:
 ; CHECK-SVE2:       // %bb.0: // %entry
@@ -89,6 +114,31 @@ entry:
   ret <vscale x 8 x i1> %active.lane.mask.alias
 }
 
+define <vscale x 8 x i1> @whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilerw_16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    subs x8, x2, x1
+; CHECK-NEXT:    cneg x8, x8, mi
+; CHECK-NEXT:    cmp x8, #2
+; CHECK-NEXT:    cset w9, lt
+; CHECK-NEXT:    whilelo p0.h, xzr, x8
+; CHECK-NEXT:    sbfx x8, x9, #0, #1
+; CHECK-NEXT:    whilelo p1.h, xzr, x8
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT:    ret
+entry:
+  %b24 = ptrtoint ptr %b to i64
+  %c25 = ptrtoint ptr %c to i64
+  %sub.diff = sub i64 %c25, %b24
+  %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+  %neg.compare = icmp slt i64 %0, 2
+  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %0)
+  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+  ret <vscale x 8 x i1> %active.lane.mask.alias
+}
+
 define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_32:
 ; CHECK-SVE2:       // %bb.0: // %entry
@@ -122,6 +172,31 @@ entry:
   ret <vscale x 4 x i1> %active.lane.mask.alias
 }
 
+define <vscale x 4 x i1> @whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilerw_32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    subs x8, x2, x1
+; CHECK-NEXT:    cneg x8, x8, mi
+; CHECK-NEXT:    cmp x8, #4
+; CHECK-NEXT:    cset w9, lt
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    sbfx x8, x9, #0, #1
+; CHECK-NEXT:    whilelo p1.s, xzr, x8
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT:    ret
+entry:
+  %b24 = ptrtoint ptr %b to i64
+  %c25 = ptrtoint ptr %c to i64
+  %sub.diff = sub i64 %c25, %b24
+  %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+  %neg.compare = icmp slt i64 %0, 4
+  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %0)
+  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
+  ret <vscale x 4 x i1> %active.lane.mask.alias
+}
+
 define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_64:
 ; CHECK-SVE2:       // %bb.0: // %entry
@@ -155,6 +230,31 @@ entry:
   ret <vscale x 2 x i1> %active.lane.mask.alias
 }
 
+define <vscale x 2 x i1> @whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilerw_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    subs x8, x2, x1
+; CHECK-NEXT:    cneg x8, x8, mi
+; CHECK-NEXT:    cmp x8, #8
+; CHECK-NEXT:    cset w9, lt
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    sbfx x8, x9, #0, #1
+; CHECK-NEXT:    whilelo p1.d, xzr, x8
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT:    ret
+entry:
+  %b24 = ptrtoint ptr %b to i64
+  %c25 = ptrtoint ptr %c to i64
+  %sub.diff = sub i64 %c25, %b24
+  %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+  %neg.compare = icmp slt i64 %0, 8
+  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %0)
+  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
+  ret <vscale x 2 x i1> %active.lane.mask.alias
+}
+
 define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: no_whilewr_128:
 ; CHECK:       // %bb.0: // %entry
@@ -192,7 +292,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_loop_8:
 ; CHECK-SVE2:       // %bb.0: // %entry
 ; CHECK-SVE2-NEXT:    cmp w3, #1
-; CHECK-SVE2-NEXT:    b.lt .LBB6_3
+; CHECK-SVE2-NEXT:    b.lt .LBB10_3
 ; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-SVE2-NEXT:    whilewr p0.b, x1, x2
 ; CHECK-SVE2-NEXT:    mov w9, w3
@@ -200,7 +300,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    whilelo p1.b, xzr, x9
 ; CHECK-SVE2-NEXT:    cntp x10, p0, p0.b
 ; CHECK-SVE2-NEXT:    and x10, x10, #0xff
-; CHECK-SVE2-NEXT:  .LBB6_2: // %vector.body
+; CHECK-SVE2-NEXT:  .LBB10_2: // %vector.body
 ; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-SVE2-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
@@ -209,14 +309,14 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    st1b { z0.b }, p1, [x2, x8]
 ; CHECK-SVE2-NEXT:    add x8, x8, x10
 ; CHECK-SVE2-NEXT:    whilelo p1.b, x8, x9
-; CHECK-SVE2-NEXT:    b.mi .LBB6_2
-; CHECK-SVE2-NEXT:  .LBB6_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    b.mi .LBB10_2
+; CHECK-SVE2-NEXT:  .LBB10_3: // %for.cond.cleanup
 ; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_8:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB6_3
+; CHECK-NOSVE2-NEXT:    b.lt .LBB10_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
@@ -230,7 +330,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x9
 ; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.b
 ; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB6_2: // %vector.body
+; CHECK-NOSVE2-NEXT:  .LBB10_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-NOSVE2-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
@@ -239,8 +339,8 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    st1b { z0.b }, p1, [x2, x8]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
 ; CHECK-NOSVE2-NEXT:    whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB6_2
-; CHECK-NOSVE2-NEXT:  .LBB6_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT:    b.mi .LBB10_2
+; CHECK-NOSVE2-NEXT:  .LBB10_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
 entry:
   %cmp11 = icmp sgt i32 %n, 0
@@ -286,7 +386,7 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_loop_16:
 ; CHECK-SVE2:       // %bb.0: // %entry
 ; CHECK-SVE2-NEXT:    cmp w3, #1
-; CHECK-SVE2-NEXT:    b.lt .LBB7_3
+; CHECK-SVE2-NEXT:    b.lt .LBB11_3
 ; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-SVE2-NEXT:    whilewr p0.h, x1, x2
 ; CHECK-SVE2-NEXT:    mov w9, w3
@@ -294,7 +394,7 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    whilelo p1.h, xzr, x9
 ; CHECK-SVE2-NEXT:    cntp x10, p0, p0.h
 ; CHECK-SVE2-NEXT:    and x10, x10, #0xff
-; CHECK-SVE2-NEXT:  .LBB7_2: // %vector.body
+; CHECK-SVE2-NEXT:  .LBB11_2: // %vector.body
 ; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-SVE2-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
@@ -303,14 +403,14 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
 ; CHECK-SVE2-NEXT:    add x8, x8, x10
 ; CHECK-SVE2-NEXT:    whilelo p1.h, x8, x9
-; CHECK-SVE2-NEXT:    b.mi .LBB7_2
-; CHECK-SVE2-NEXT:  .LBB7_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    b.mi .LBB11_2
+; CHECK-SVE2-NEXT:  .LBB11_3: // %for.cond.cleanup
 ; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_16:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB7_3
+; CHECK-NOSVE2-NEXT:    b.lt .LBB11_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
@@ -326,7 +426,7 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x9
 ; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.h
 ; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB7_2: // %vector.body
+; CHECK-NOSVE2-NEXT:  .LBB11_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-NOSVE2-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
@@ -335,8 +435,8 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
 ; CHECK-NOSVE2-NEXT:    whilelo p1.h, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB7_2
-; CHECK-NOSVE2-NEXT:  .LBB7_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT:    b.mi .LBB11_2
+; CHECK-NOSVE2-NEXT:  .LBB11_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
 entry:
   %cmp11 = icmp sgt i32 %n, 0
@@ -383,7 +483,7 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_loop_32:
 ; CHECK-SVE2:       // %bb.0: // %entry
 ; CHECK-SVE2-NEXT:    cmp w3, #1
-; CHECK-SVE2-NEXT:    b.lt .LBB8_3
+; CHECK-SVE2-NEXT:    b.lt .LBB12_3
 ; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-SVE2-NEXT:    whilewr p0.s, x1, x2
 ; CHECK-SVE2-NEXT:    mov w9, w3
@@ -391,7 +491,7 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    whilelo p1.s, xzr, x9
 ; CHECK-SVE2-NEXT:    cntp x10, p0, p0.s
 ; CHECK-SVE2-NEXT:    and x10, x10, #0xff
-; CHECK-SVE2-NEXT:  .LBB8_2: // %vector.body
+; CHECK-SVE2-NEXT:  .LBB12_2: // %vector.body
 ; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-SVE2-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
@@ -400,14 +500,14 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
 ; CHECK-SVE2-NEXT:    add x8, x8, x10
 ; CHECK-SVE2-NEXT:    whilelo p1.s, x8, x9
-; CHECK-SVE2-NEXT:    b.mi .LBB8_2
-; CHECK-SVE2-NEXT:  .LBB8_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    b.mi .LBB12_2
+; CHECK-SVE2-NEXT:  .LBB12_3: // %for.cond.cleanup
 ; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_32:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB8_3
+; CHECK-NOSVE2-NEXT:    b.lt .LBB12_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
@@ -425,7 +525,7 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x9
 ; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.s
 ; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB8_2: // %vector.body
+; CHECK-NOSVE2-NEXT:  .LBB12_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-NOSVE2-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
@@ -434,8 +534,8 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
 ; CHECK-NOSVE2-NEXT:    whilelo p1.s, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB8_2
-; CHECK-NOSVE2-NEXT:  .LBB8_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT:    b.mi .LBB12_2
+; CHECK-NOSVE2-NEXT:  .LBB12_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
 entry:
   %cmp9 = icmp sgt i32 %n, 0
@@ -482,7 +582,7 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_loop_64:
 ; CHECK-SVE2:       // %bb.0: // %entry
 ; CHECK-SVE2-NEXT:    cmp w3, #1
-; CHECK-SVE2-NEXT:    b.lt .LBB9_3
+; CHECK-SVE2-NEXT:    b.lt .LBB13_3
 ; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-SVE2-NEXT:    whilewr p0.d, x1, x2
 ; CHECK-SVE2-NEXT:    mov w9, w3
@@ -490,7 +590,7 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    whilelo p1.d, xzr, x9
 ; CHECK-SVE2-NEXT:    cntp x10, p0, p0.d
 ; CHECK-SVE2-NEXT:    and x10, x10, #0xff
-; CHECK-SVE2-NEXT:  .LBB9_2: // %vector.body
+; CHECK-SVE2-NEXT:  .LBB13_2: // %vector.body
 ; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-SVE2-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
@@ -499,14 +599,14 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    st1d { z0.d }, p1, [x2, x8, lsl #3]
 ; CHECK-SVE2-NEXT:    add x8, x8, x10
 ; CHECK-SVE2-NEXT:    whilelo p1.d, x8, x9
-; CHECK-SVE2-NEXT:    b.mi .LBB9_2
-; CHECK-SVE2-NEXT:  .LBB9_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    b.mi .LBB13_2
+; CHECK-SVE2-NEXT:  .LBB13_3: // %for.cond.cleanup
 ; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_64:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB9_3
+; CHECK-NOSVE2-NEXT:    b.lt .LBB13_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
@@ -524,7 +624,7 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x9
 ; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.d
 ; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB9_2: // %vector.body
+; CHECK-NOSVE2-NEXT:  .LBB13_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-NOSVE2-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
@@ -533,8 +633,8 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    st1d { z0.d }, p1, [x2, x8, lsl #3]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
 ; CHECK-NOSVE2-NEXT:    whilelo p1.d, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB9_2
-; CHECK-NOSVE2-NEXT:  .LBB9_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT:    b.mi .LBB13_2
+; CHECK-NOSVE2-NEXT:  .LBB13_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
 entry:
   %cmp9 = icmp sgt i32 %n, 0
@@ -581,7 +681,7 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_loop_multiple_8:
 ; CHECK-SVE2:       // %bb.0: // %entry
 ; CHECK-SVE2-NEXT:    cmp w3, #1
-; CHECK-SVE2-NEXT:    b.lt .LBB10_3
+; CHECK-SVE2-NEXT:    b.lt .LBB14_3
 ; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-SVE2-NEXT:    whilewr p0.b, x0, x2
 ; CHECK-SVE2-NEXT:    mov w9, w3
@@ -591,7 +691,7 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    whilelo p1.b, xzr, x9
 ; CHECK-SVE2-NEXT:    cntp x10, p0, p0.b
 ; CHECK-SVE2-NEXT:    and x10, x10, #0xff
-; CHECK-SVE2-NEXT:  .LBB10_2: // %vector.body
+; CHECK-SVE2-NEXT:  .LBB14_2: // %vector.body
 ; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-SVE2-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
@@ -600,14 +700,14 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    st1b { z0.b }, p1, [x2, x8]
 ; CHECK-SVE2-NEXT:    add x8, x8, x10
 ; CHECK-SVE2-NEXT:    whilelo p1.b, x8, x9
-; CHECK-SVE2-NEXT:    b.mi .LBB10_2
-; CHECK-SVE2-NEXT:  .LBB10_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    b.mi .LBB14_2
+; CHECK-SVE2-NEXT:  .LBB14_3: // %for.cond.cleanup
 ; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB10_3
+; CHECK-NOSVE2-NEXT:    b.lt .LBB14_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
@@ -629,7 +729,7 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x9
 ; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.b
 ; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB10_2: // %vector.body
+; CHECK-NOSVE2-NEXT:  .LBB14_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-NOSVE2-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
@@ -638,8 +738,8 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    st1b { z0.b }, p1, [x2, x8]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
 ; CHECK-NOSVE2-NEXT:    whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB10_2
-; CHECK-NOSVE2-NEXT:  .LBB10_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT:    b.mi .LBB14_2
+; CHECK-NOSVE2-NEXT:  .LBB14_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
 entry:
   %cmp11 = icmp sgt i32 %n, 0
@@ -693,7 +793,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_loop_multiple_16:
 ; CHECK-SVE2:       // %bb.0: // %entry
 ; CHECK-SVE2-NEXT:    cmp w3, #1
-; CHECK-SVE2-NEXT:    b.lt .LBB11_3
+; CHECK-SVE2-NEXT:    b.lt .LBB15_3
 ; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-SVE2-NEXT:    whilewr p0.h, x0, x2
 ; CHECK-SVE2-NEXT:    mov w9, w3
@@ -703,7 +803,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    whilelo p1.h, xzr, x9
 ; CHECK-SVE2-NEXT:    cntp x10, p0, p0.h
 ; CHECK-SVE2-NEXT:    and x10, x10, #0xff
-; CHECK-SVE2-NEXT:  .LBB11_2: // %vector.body
+; CHECK-SVE2-NEXT:  .LBB15_2: // %vector.body
 ; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-SVE2-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
@@ -712,14 +812,14 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
 ; CHECK-SVE2-NEXT:    add x8, x8, x10
 ; CHECK-SVE2-NEXT:    whilelo p1.h, x8, x9
-; CHECK-SVE2-NEXT:    b.mi .LBB11_2
-; CHECK-SVE2-NEXT:  .LBB11_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    b.mi .LBB15_2
+; CHECK-SVE2-NEXT:  .LBB15_3: // %for.cond.cleanup
 ; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB11_3
+; CHECK-NOSVE2-NEXT:    b.lt .LBB15_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
@@ -745,7 +845,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x9
 ; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.h
 ; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB11_2: // %vector.body
+; CHECK-NOSVE2-NEXT:  .LBB15_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-NOSVE2-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
@@ -754,8 +854,8 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
 ; CHECK-NOSVE2-NEXT:    whilelo p1.h, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB11_2
-; CHECK-NOSVE2-NEXT:  .LBB11_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT:    b.mi .LBB15_2
+; CHECK-NOSVE2-NEXT:  .LBB15_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
 entry:
   %cmp11 = icmp sgt i32 %n, 0
@@ -811,7 +911,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_loop_multiple_32:
 ; CHECK-SVE2:       // %bb.0: // %entry
 ; CHECK-SVE2-NEXT:    cmp w3, #1
-; CHECK-SVE2-NEXT:    b.lt .LBB12_3
+; CHECK-SVE2-NEXT:    b.lt .LBB16_3
 ; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-SVE2-NEXT:    whilewr p0.s, x0, x2
 ; CHECK-SVE2-NEXT:    mov w9, w3
@@ -821,7 +921,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    whilelo p1.s, xzr, x9
 ; CHECK-SVE2-NEXT:    cntp x10, p0, p0.s
 ; CHECK-SVE2-NEXT:    and x10, x10, #0xff
-; CHECK-SVE2-NEXT:  .LBB12_2: // %vector.body
+; CHECK-SVE2-NEXT:  .LBB16_2: // %vector.body
 ; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-SVE2-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
@@ -830,14 +930,14 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
 ; CHECK-SVE2-NEXT:    add x8, x8, x10
 ; CHECK-SVE2-NEXT:    whilelo p1.s, x8, x9
-; CHECK-SVE2-NEXT:    b.mi .LBB12_2
-; CHECK-SVE2-NEXT:  .LBB12_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    b.mi .LBB16_2
+; CHECK-SVE2-NEXT:  .LBB16_3: // %for.cond.cleanup
 ; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB12_3
+; CHECK-NOSVE2-NEXT:    b.lt .LBB16_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
@@ -867,7 +967,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x9
 ; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.s
 ; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB12_2: // %vector.body
+; CHECK-NOSVE2-NEXT:  .LBB16_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-NOSVE2-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
@@ -876,8 +976,8 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
 ; CHECK-NOSVE2-NEXT:    whilelo p1.s, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB12_2
-; CHECK-NOSVE2-NEXT:  .LBB12_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT:    b.mi .LBB16_2
+; CHECK-NOSVE2-NEXT:  .LBB16_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
 entry:
   %cmp9 = icmp sgt i32 %n, 0
@@ -933,7 +1033,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-LABEL: whilewr_loop_multiple_64:
 ; CHECK-SVE2:       // %bb.0: // %entry
 ; CHECK-SVE2-NEXT:    cmp w3, #1
-; CHECK-SVE2-NEXT:    b.lt .LBB13_3
+; CHECK-SVE2-NEXT:    b.lt .LBB17_3
 ; CHECK-SVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-SVE2-NEXT:    whilewr p0.d, x0, x2
 ; CHECK-SVE2-NEXT:    mov w9, w3
@@ -943,7 +1043,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    whilelo p1.d, xzr, x9
 ; CHECK-SVE2-NEXT:    cntp x10, p0, p0.d
 ; CHECK-SVE2-NEXT:    and x10, x10, #0xff
-; CHECK-SVE2-NEXT:  .LBB13_2: // %vector.body
+; CHECK-SVE2-NEXT:  .LBB17_2: // %vector.body
 ; CHECK-SVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-SVE2-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
@@ -952,14 +1052,14 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-SVE2-NEXT:    st1d { z0.d }, p1, [x2, x8, lsl #3]
 ; CHECK-SVE2-NEXT:    add x8, x8, x10
 ; CHECK-SVE2-NEXT:    whilelo p1.d, x8, x9
-; CHECK-SVE2-NEXT:    b.mi .LBB13_2
-; CHECK-SVE2-NEXT:  .LBB13_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT:    b.mi .LBB17_2
+; CHECK-SVE2-NEXT:  .LBB17_3: // %for.cond.cleanup
 ; CHECK-SVE2-NEXT:    ret
 ;
 ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64:
 ; CHECK-NOSVE2:       // %bb.0: // %entry
 ; CHECK-NOSVE2-NEXT:    cmp w3, #1
-; CHECK-NOSVE2-NEXT:    b.lt .LBB13_3
+; CHECK-NOSVE2-NEXT:    b.lt .LBB17_3
 ; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
 ; CHECK-NOSVE2-NEXT:    mov x8, xzr
@@ -989,7 +1089,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x9
 ; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.d
 ; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT:  .LBB13_2: // %vector.body
+; CHECK-NOSVE2-NEXT:  .LBB17_2: // %vector.body
 ; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
 ; CHECK-NOSVE2-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
@@ -998,8 +1098,8 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NOSVE2-NEXT:    st1d { z0.d }, p1, [x2, x8, lsl #3]
 ; CHECK-NOSVE2-NEXT:    add x8, x8, x10
 ; CHECK-NOSVE2-NEXT:    whilelo p1.d, x8, x9
-; CHECK-NOSVE2-NEXT:    b.mi .LBB13_2
-; CHECK-NOSVE2-NEXT:  .LBB13_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT:    b.mi .LBB17_2
+; CHECK-NOSVE2-NEXT:  .LBB17_3: // %for.cond.cleanup
 ; CHECK-NOSVE2-NEXT:    ret
 entry:
   %cmp9 = icmp sgt i32 %n, 0
@@ -1076,3 +1176,5 @@ declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64)
 declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>)
 
 declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>)
+
+declare i64 @llvm.abs.i64(i64, i1 immarg)

>From f67fa7a1e9d8acae39fac5a937f0f4fbdd816840 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 24 Oct 2024 11:41:54 +0100
Subject: [PATCH 4/5] Lower read-after-write to whilerw

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  34 +++--
 llvm/test/CodeGen/AArch64/whilewr.ll          | 127 +++++++++++-------
 2 files changed, 106 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 97650782c91356..bfb120aa492bbf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14186,7 +14186,16 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
     return SDValue();
 
   SDValue Diff = Cmp.getOperand(0);
-  if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
+  SDValue NonAbsDiff = Diff;
+  bool WriteAfterRead = true;
+  // A read-after-write will have an abs call on the diff
+  if (Diff.getOpcode() == ISD::ABS) {
+    NonAbsDiff = Diff.getOperand(0);
+    WriteAfterRead = false;
+  }
+
+  if (NonAbsDiff.getOpcode() != ISD::SUB ||
+      NonAbsDiff.getValueType() != MVT::i64)
     return SDValue();
 
   if (!isNullConstant(LaneMask.getOperand(1)) ||
@@ -14207,8 +14216,13 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
       // it's positive, otherwise the difference plus the element size if it's
       // negative: pos_diff = diff < 0 ? (diff + 7) : diff
       SDValue Select = DiffDiv.getOperand(0);
+      SDValue SelectOp3 = Select.getOperand(3);
+      // Check for an abs in the case of a read-after-write
+      if (!WriteAfterRead && SelectOp3.getOpcode() == ISD::ABS)
+        SelectOp3 = SelectOp3.getOperand(0);
+
       // Make sure the difference is being compared by the select
-      if (Select.getOpcode() != ISD::SELECT_CC || Select.getOperand(3) != Diff)
+      if (Select.getOpcode() != ISD::SELECT_CC || SelectOp3 != NonAbsDiff)
         return SDValue();
       // Make sure it's checking if the difference is less than 0
       if (!isNullConstant(Select.getOperand(1)) ||
@@ -14240,22 +14254,26 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
   } else if (LaneMask.getOperand(2) != Diff)
     return SDValue();
 
-  SDValue StorePtr = Diff.getOperand(0);
-  SDValue ReadPtr = Diff.getOperand(1);
+  SDValue StorePtr = NonAbsDiff.getOperand(0);
+  SDValue ReadPtr = NonAbsDiff.getOperand(1);
 
   unsigned IntrinsicID = 0;
   switch (EltSize) {
   case 1:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_b;
+    IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_b
+                                 : Intrinsic::aarch64_sve_whilerw_b;
     break;
   case 2:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_h;
+    IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_h
+                                 : Intrinsic::aarch64_sve_whilerw_h;
     break;
   case 4:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_s;
+    IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_s
+                                 : Intrinsic::aarch64_sve_whilerw_s;
     break;
   case 8:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_d;
+    IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_d
+                                 : Intrinsic::aarch64_sve_whilerw_d;
     break;
   default:
     return SDValue();
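
The intrinsic selection above boils down to a small table keyed on element size and hazard direction. A standalone sketch of that mapping (intrinsic names given as plain strings so the snippet compiles on its own; it is not the lowering code itself):

#include <string>

std::string pickWhileIntrinsic(unsigned EltSize, bool WriteAfterRead) {
  const char *Suffix;
  switch (EltSize) {
  case 1: Suffix = "_b"; break;
  case 2: Suffix = "_h"; break;
  case 4: Suffix = "_s"; break;
  case 8: Suffix = "_d"; break;
  default: return "";                  // unsupported element size: no lowering
  }
  return std::string(WriteAfterRead ? "aarch64_sve_whilewr"
                                    : "aarch64_sve_whilerw") + Suffix;
}
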
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 58f786e3885fa0..05d49df7dc56f7 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -31,17 +31,22 @@ entry:
 }
 
 define <vscale x 16 x i1> @whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilerw_8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    subs x8, x2, x1
-; CHECK-NEXT:    cneg x8, x8, mi
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    whilelo p0.b, xzr, x8
-; CHECK-NEXT:    sbfx x8, x9, #0, #1
-; CHECK-NEXT:    whilelo p1.b, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilerw_8:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    whilerw p0.b, x2, x1
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-NOSVE2-LABEL: whilerw_8:
+; CHECK-NOSVE2:       // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT:    subs x8, x2, x1
+; CHECK-NOSVE2-NEXT:    cneg x8, x8, mi
+; CHECK-NOSVE2-NEXT:    cmp x8, #1
+; CHECK-NOSVE2-NEXT:    cset w9, lt
+; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x8
+; CHECK-NOSVE2-NEXT:    sbfx x8, x9, #0, #1
+; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x8
+; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT:    ret
 entry:
   %b24 = ptrtoint ptr %b to i64
   %c25 = ptrtoint ptr %c to i64
@@ -115,26 +120,34 @@ entry:
 }
 
 define <vscale x 8 x i1> @whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilerw_16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    subs x8, x2, x1
-; CHECK-NEXT:    cneg x8, x8, mi
-; CHECK-NEXT:    cmp x8, #2
-; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    whilelo p0.h, xzr, x8
-; CHECK-NEXT:    sbfx x8, x9, #0, #1
-; CHECK-NEXT:    whilelo p1.h, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilerw_16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    whilerw p0.h, x2, x1
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-NOSVE2-LABEL: whilerw_16:
+; CHECK-NOSVE2:       // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT:    subs x8, x2, x1
+; CHECK-NOSVE2-NEXT:    cneg x8, x8, mi
+; CHECK-NOSVE2-NEXT:    cmp x8, #2
+; CHECK-NOSVE2-NEXT:    add x8, x8, x8, lsr #63
+; CHECK-NOSVE2-NEXT:    cset w9, lt
+; CHECK-NOSVE2-NEXT:    sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT:    asr x8, x8, #1
+; CHECK-NOSVE2-NEXT:    whilelo p0.h, xzr, x9
+; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x8
+; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT:    ret
 entry:
   %b24 = ptrtoint ptr %b to i64
   %c25 = ptrtoint ptr %c to i64
   %sub.diff = sub i64 %c25, %b24
   %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+  %diff = sdiv i64 %0, 2
   %neg.compare = icmp slt i64 %0, 2
   %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %0)
+  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
   %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
   ret <vscale x 8 x i1> %active.lane.mask.alias
 }
@@ -173,26 +186,36 @@ entry:
 }
 
 define <vscale x 4 x i1> @whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilerw_32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    subs x8, x2, x1
-; CHECK-NEXT:    cneg x8, x8, mi
-; CHECK-NEXT:    cmp x8, #4
-; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    whilelo p0.s, xzr, x8
-; CHECK-NEXT:    sbfx x8, x9, #0, #1
-; CHECK-NEXT:    whilelo p1.s, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilerw_32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    whilerw p0.s, x2, x1
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-NOSVE2-LABEL: whilerw_32:
+; CHECK-NOSVE2:       // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT:    subs x8, x2, x1
+; CHECK-NOSVE2-NEXT:    cneg x8, x8, mi
+; CHECK-NOSVE2-NEXT:    add x9, x8, #3
+; CHECK-NOSVE2-NEXT:    cmp x8, #0
+; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
+; CHECK-NOSVE2-NEXT:    cmp x8, #4
+; CHECK-NOSVE2-NEXT:    cset w8, lt
+; CHECK-NOSVE2-NEXT:    asr x9, x9, #2
+; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x9
+; CHECK-NOSVE2-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT:    ret
 entry:
   %b24 = ptrtoint ptr %b to i64
   %c25 = ptrtoint ptr %c to i64
   %sub.diff = sub i64 %c25, %b24
   %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+  %diff = sdiv i64 %0, 4
   %neg.compare = icmp slt i64 %0, 4
   %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %0)
+  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
   %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
   ret <vscale x 4 x i1> %active.lane.mask.alias
 }
@@ -231,26 +254,36 @@ entry:
 }
 
 define <vscale x 2 x i1> @whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilerw_64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    subs x8, x2, x1
-; CHECK-NEXT:    cneg x8, x8, mi
-; CHECK-NEXT:    cmp x8, #8
-; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    whilelo p0.d, xzr, x8
-; CHECK-NEXT:    sbfx x8, x9, #0, #1
-; CHECK-NEXT:    whilelo p1.d, xzr, x8
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: whilerw_64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    whilerw p0.d, x2, x1
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-NOSVE2-LABEL: whilerw_64:
+; CHECK-NOSVE2:       // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT:    subs x8, x2, x1
+; CHECK-NOSVE2-NEXT:    cneg x8, x8, mi
+; CHECK-NOSVE2-NEXT:    add x9, x8, #7
+; CHECK-NOSVE2-NEXT:    cmp x8, #0
+; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
+; CHECK-NOSVE2-NEXT:    cmp x8, #8
+; CHECK-NOSVE2-NEXT:    cset w8, lt
+; CHECK-NOSVE2-NEXT:    asr x9, x9, #3
+; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x9
+; CHECK-NOSVE2-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT:    ret
 entry:
   %b24 = ptrtoint ptr %b to i64
   %c25 = ptrtoint ptr %c to i64
   %sub.diff = sub i64 %c25, %b24
   %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+  %diff = sdiv i64 %0, 8
   %neg.compare = icmp slt i64 %0, 8
   %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
   %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %0)
+  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
   %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
   ret <vscale x 2 x i1> %active.lane.mask.alias
 }

>From 271807d907f4393be86de91c66e7b9d4451fbcf9 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 25 Oct 2024 15:58:12 +0100
Subject: [PATCH 5/5] Add getRTCheckStyle and useSafeEltsMask

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ca5581683e824..622854d2efd1dd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1406,6 +1406,21 @@ class LoopVectorizationCostModel {
                                : ChosenTailFoldingStyle->second;
   }
 
+  RTCheckStyle getRTCheckStyle(TailFoldingStyle TFStyle) const {
+    switch (TFStyle) {
+    case TailFoldingStyle::Data:
+    case TailFoldingStyle::DataAndControlFlow:
+    case TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck:
+      return RTCheckStyle::UseSafeEltsMask;
+    default:
+      return RTCheckStyle::ScalarFallback;
+    }
+  }
+
+  RTCheckStyle getRTCheckStyle() const {
+    return getRTCheckStyle(getTailFoldingStyle());
+  }
+
   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
   /// overflow or not.
   /// \param IsScalableVF true if scalable vector factors enabled.
@@ -2118,6 +2133,10 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
 }
 
+static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style) {
+  return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask;
+}
+
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -7126,7 +7145,9 @@ void LoopVectorizationPlanner::plan(
     return;
 
   ArrayRef<PointerDiffInfo> DiffChecks;
-  if (RTChecks.has_value() && useActiveLaneMask(CM.getTailFoldingStyle(true)))
+  auto TFStyle = CM.getTailFoldingStyle();
+  if (RTChecks.has_value() &&
+      useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle)))
     DiffChecks = *RTChecks;
 
   // Invalidate interleave groups if all blocks of loop will be predicated.
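
As a small illustration of the gate added above (not the LoopVectorize API; the enum here mirrors the patch), the diff checks are only forwarded into planning when the chosen tail-folding style already provides an active lane mask and the runtime-check style asks for the safe-elements mask:

enum class RTCheckStyle { ScalarFallback, UseSafeEltsMask };

// Mirrors useSafeEltsMask(): take the alias-mask path only when tail folding
// already supplies an active lane mask and the check style requests it.
static bool shouldUseSafeEltsMask(bool TailFoldingUsesActiveLaneMask,
                                  RTCheckStyle Style) {
  return TailFoldingUsesActiveLaneMask &&
         Style == RTCheckStyle::UseSafeEltsMask;
}

plan() then copies *RTChecks into DiffChecks so the transform that builds the alias lane mask can see them.
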


