[llvm-branch-commits] [llvm] [LV] Mask off possibly aliasing vector lanes (PR #100579)

Sam Tebbs via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Aug 15 03:46:28 PDT 2025


https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/100579

From cf3bd5da6d5906b4709f9b056bbd6378376cf361 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 18 Oct 2024 15:49:22 +0100
Subject: [PATCH 01/13] [LV] Mask off possibly aliasing vector lanes

    When vectorising a loop that uses loads and stores, the load and store
    pointers could overlap if their difference is less than the vector
    factor. For example, if address 20 is being stored to and address 23 is
    being loaded from, they overlap when the vector factor is 4 or higher.
    Currently LoopVectorize adds a runtime check and branches to a scalar
    loop in these cases. However, if we construct a mask that disables the
    overlapping (aliasing) lanes, then the vectorised loop can be safely
    entered as long as the loads and stores are masked off.
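
    As a rough illustration (not part of the patch; the shape is taken from
    the new alias_mask test added below), the kind of loop this targets
    looks like this in C:

      // a is known not to alias, but b and c may overlap, so today the
      // runtime diff check sends the loop down the scalar path. With an
      // alias mask, only the non-overlapping lanes are loaded and stored.
      void alias_mask(char *restrict a, char *b, char *c, long n) {
        for (long i = 0; i < n; ++i)
          c[i] = a[i] + b[i];
      }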
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |   5 +-
 .../llvm/Analysis/TargetTransformInfo.h       |   7 +
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |   5 +-
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |   3 +-
 .../Vectorize/LoopVectorizationPlanner.h      |  22 ++-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 101 +++++++---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  52 +++++
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   4 +
 .../Vectorize/VPlanConstruction.cpp           |  30 +--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  62 +++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  70 +++++--
 .../Transforms/Vectorize/VPlanTransforms.h    |  11 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   1 +
 .../LoopVectorize/AArch64/alias_mask.ll       | 185 ++++++++++++++++++
 .../AArch64/induction-costs-sve.ll            |  65 ++++--
 .../runtime-checks-difference.ll              |   5 +-
 .../vplan-printing-predicated.ll              | 115 +++++++++++
 17 files changed, 662 insertions(+), 81 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 92304edd67a44..5651b597a77fc 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -489,11 +489,12 @@ struct PointerDiffInfo {
   const SCEV *SinkStart;
   unsigned AccessSize;
   bool NeedsFreeze;
+  bool WriteAfterRead;
 
   PointerDiffInfo(const SCEV *SrcStart, const SCEV *SinkStart,
-                  unsigned AccessSize, bool NeedsFreeze)
+                  unsigned AccessSize, bool NeedsFreeze, bool WriteAfterRead)
       : SrcStart(SrcStart), SinkStart(SinkStart), AccessSize(AccessSize),
-        NeedsFreeze(NeedsFreeze) {}
+        NeedsFreeze(NeedsFreeze), WriteAfterRead(WriteAfterRead) {}
 };
 
 /// Holds information about the memory runtime legality checks to verify
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 53c91bfe16804..6e10fa4573182 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -201,6 +201,13 @@ enum class TailFoldingStyle {
   DataWithEVL,
 };
 
+enum class RTCheckStyle {
+  /// Branch to the scalar loop if the checks fail at runtime.
+  ScalarFallback,
+  /// Form a mask of the elements which won't cause a WAR or RAW hazard.
+  UseSafeEltsMask,
+};
+
 struct TailFoldingInfo {
   TargetLibraryInfo *TLI;
   LoopVectorizationLegality *LVL;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index a5535339a714f..20ac57fe7d889 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -482,11 +482,14 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
     }
   }
 
+  bool WriteAfterRead = isa<LoadInst>(SrcInsts[0]);
+
   LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
                     << "SrcStart: " << *SrcStartInt << '\n'
                     << "SinkStartInt: " << *SinkStartInt << '\n');
   DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
-                          Src->NeedsFreeze || Sink->NeedsFreeze);
+                          Src->NeedsFreeze || Sink->NeedsFreeze,
+                          WriteAfterRead);
   return true;
 }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 2d830f3b6f952..2d3e7f85ee3a8 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2034,7 +2034,8 @@ Value *llvm::addDiffRuntimeChecks(
   // Map to keep track of created compares, The key is the pair of operands for
   // the compare, to allow detecting and re-using redundant compares.
   DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
-  for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze] : Checks) {
+  for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze,
+                    WriteAfterRead] : Checks) {
     Type *Ty = SinkStart->getType();
     // Compute VF * IC * AccessSize.
     auto *VFTimesICTimesSize =
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 4856ebebb596f..b5d9d6e4b7837 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -493,7 +493,13 @@ class LoopVectorizationPlanner {
   /// Build VPlans for the specified \p UserVF and \p UserIC if they are
   /// non-zero or all applicable candidate VFs otherwise. If vectorization and
   /// interleaving should be avoided up-front, no plans are generated.
-  void plan(ElementCount UserVF, unsigned UserIC);
+  /// DiffChecks is a list of pointer pairs that should be checked for
+  /// aliasing. HasAliasMask is set to true if an alias mask is generated, in
+  /// which case the vector loop should be entered even if the pointers alias
+  /// across a loop iteration.
+  void plan(ElementCount UserVF, unsigned UserIC,
+            std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
+            bool &HasAliasMask);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
@@ -578,13 +584,23 @@ class LoopVectorizationPlanner {
   /// set the largest included VF to the maximum VF for which no plan could be
   /// built. Each VPlan is built starting from a copy of \p InitialPlan, which
   /// is a plain CFG VPlan wrapping the original scalar loop.
+  /// RTChecks is a list of pointer pairs that should be checked for
+  /// aliasing. HasAliasMask is set to true if an alias mask is generated, in
+  /// which case the vector loop should be entered even if the pointers alias
+  /// across a loop iteration.
   VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range,
-                                        LoopVersioning *LVer);
+                                        LoopVersioning *LVer,
+                                        ArrayRef<PointerDiffInfo> RTChecks,
+                                        bool &HasAliasMask);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
-  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
+  /// RTChecks contains a list of pointer pairs that an alias mask should be
+  /// generated for.
+  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
+                                ArrayRef<PointerDiffInfo> RTChecks,
+                                bool &HasAliasMask);
 
   // Adjust the recipes for reductions. For in-loop reductions the chain of
   // instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d201fbccbd91d..3aa422d4fa16c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -178,6 +178,7 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
 STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
+STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");
 
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -567,7 +568,11 @@ class InnerLoopVectorizer {
   /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
   /// vector preheader and its predecessor, also connecting the new block to the
   /// scalar preheader.
-  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
+  /// If HasAliasMask is true, the vector loop is branched to unconditionally,
+  /// instead of there being a conditional branch to either the scalar or the
+  /// vector loop.
+  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB,
+                                  bool HasAliasMask = false);
 
   /// The original loop.
   Loop *OrigLoop;
@@ -1348,6 +1353,21 @@ class LoopVectorizationCostModel {
                                : ChosenTailFoldingStyle->second;
   }
 
+  RTCheckStyle getRTCheckStyle(TailFoldingStyle TFStyle) const {
+    switch (TFStyle) {
+    case TailFoldingStyle::Data:
+    case TailFoldingStyle::DataAndControlFlow:
+    case TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck:
+      return RTCheckStyle::UseSafeEltsMask;
+    default:
+      return RTCheckStyle::ScalarFallback;
+    }
+  }
+
+  RTCheckStyle getRTCheckStyle() const {
+    return getRTCheckStyle(getTailFoldingStyle());
+  }
+
   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
   /// overflow or not.
   /// \param IsScalableVF true if scalable vector factors enabled.
@@ -1803,6 +1823,10 @@ class GeneratedRTChecks {
   TTI::TargetCostKind CostKind;
 
 public:
+  /// Set by VPlan when the vector loop should be entered even when runtime
+  /// checks determine that pointers alias within an iteration.
+  bool HasAliasMask = false;
+
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
                     const DataLayout &DL, TTI::TargetCostKind CostKind)
@@ -2064,6 +2088,10 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
 }
 
+static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style) {
+  return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask;
+}
+
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -2273,7 +2301,7 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
   return TTI.enableMaskedInterleavedAccessVectorization();
 }
 
-void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
+void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB, bool HasAliasMask) {
   // Note: The block with the minimum trip-count check is already connected
   // during earlier VPlan construction.
   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
@@ -2283,17 +2311,19 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
   VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
   VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
   PreVectorPH = CheckVPIRBB;
-  VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
-  PreVectorPH->swapSuccessors();
-
-  // We just connected a new block to the scalar preheader. Update all
-  // VPPhis by adding an incoming value for it, replicating the last value.
-  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
-  for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
-    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
-    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
-           "must have incoming values for all operands");
-    R.addOperand(R.getOperand(NumPredecessors - 2));
+  if (!HasAliasMask) {
+    VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
+    PreVectorPH->swapSuccessors();
+
+    // We just connected a new block to the scalar preheader. Update all
+    // VPPhis by adding an incoming value for it, replicating the last value.
+    unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+    for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+      assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+      assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+             "must have incoming values for all operands");
+      R.addOperand(R.getOperand(NumPredecessors - 2));
+    }
   }
 }
 
@@ -2372,7 +2402,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                    static_cast<DominatorTree *>(nullptr), LI,
                                    nullptr, "vector.ph");
-
   BranchInst &BI =
       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
@@ -6697,7 +6726,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   return VectorizationFactor::Disabled();
 }
 
-void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+void LoopVectorizationPlanner::plan(
+    ElementCount UserVF, unsigned UserIC,
+    std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
@@ -6706,6 +6737,12 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
     return;
 
+  ArrayRef<PointerDiffInfo> DiffChecks;
+  auto TFStyle = CM.getTailFoldingStyle();
+  if (RTChecks.has_value() &&
+      useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle)))
+    DiffChecks = *RTChecks;
+
   // Invalidate interleave groups if all blocks of loop will be predicated.
   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
       !useMaskedInterleavedAccesses(TTI)) {
@@ -6738,7 +6775,7 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.collectInLoopReductions();
       if (CM.selectUserVectorizationFactor(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-        buildVPlansWithVPRecipes(UserVF, UserVF);
+        buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks, HasAliasMask);
         LLVM_DEBUG(printPlans(dbgs()));
         return;
       }
@@ -6762,8 +6799,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
     CM.collectNonVectorizedAndSetWideningDecisions(VF);
   }
 
-  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
-  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
+  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
+                           DiffChecks, HasAliasMask);
+  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
+                           DiffChecks, HasAliasMask);
 
   LLVM_DEBUG(printPlans(dbgs()));
 }
@@ -8373,7 +8412,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
-                                                        ElementCount MaxVF) {
+    ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
+    bool &HasAliasMask) {
   if (ElementCount::isKnownGT(MinVF, MaxVF))
     return;
 
@@ -8399,7 +8439,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
     if (auto Plan = tryToBuildVPlanWithVPRecipes(
-            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
+            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer, RTChecks, HasAliasMask)) {
       bool HasScalarVF = Plan->hasScalarVFOnly();
       // Now optimize the initial VPlan.
       if (!HasScalarVF)
@@ -8624,7 +8664,7 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
 }
 
 VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
-    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
+    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask) {
 
   using namespace llvm::VPlanPatternMatch;
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8913,7 +8953,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     bool WithoutRuntimeCheck =
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
-                                       WithoutRuntimeCheck);
+                                       WithoutRuntimeCheck, PSE, RTChecks);
+    if (ForControlFlow && !RTChecks.empty())
+      HasAliasMask = true;
   }
   VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
 
@@ -9329,7 +9371,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
             CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
            "Cannot SCEV check stride or overflow when optimizing for size");
     VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
-                                      HasBranchWeights);
+                                      HasBranchWeights, RTChecks.HasAliasMask);
   }
   const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
   if (MemCheckBlock) {
@@ -9354,7 +9396,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
       });
     }
     VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
-                                      HasBranchWeights);
+                                      HasBranchWeights, RTChecks.HasAliasMask);
   }
 }
 
@@ -9485,6 +9527,7 @@ static bool processLoopInVPlanNativePath(
   // Mark the loop as already vectorized to avoid vectorizing again.
   Hints.setAlreadyVectorized();
   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+
   return true;
 }
 
@@ -10099,15 +10142,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
 
+  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
+                           CM.CostKind);
+
   // Plan how to best vectorize.
-  LVP.plan(UserVF, UserIC);
+  LVP.plan(UserVF, UserIC,
+           LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(),
+           Checks.HasAliasMask);
   VectorizationFactor VF = LVP.computeBestVF();
+  if (Checks.HasAliasMask)
+    LoopsAliasMasked++;
   unsigned IC = 1;
 
   if (ORE->allowExtraAnalysis(LV_NAME))
     LVP.emitInvalidCostRemarks(ORE);
 
-  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 11a7d8b339ae9..39269eaf29add 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -529,6 +529,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     switch (R->getVPDefID()) {
     case VPRecipeBase::VPDerivedIVSC:
     case VPRecipeBase::VPEVLBasedIVPHISC:
+    case VPRecipeBase::VPAliasLaneMaskSC:
     case VPRecipeBase::VPExpandSCEVSC:
     case VPRecipeBase::VPExpressionSC:
     case VPRecipeBase::VPInstructionSC:
@@ -987,6 +988,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     // during unrolling.
     ExtractPenultimateElement,
     LogicalAnd, // Non-poison propagating logical And.
+    PopCount,
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
@@ -3206,6 +3208,56 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   }
 };
 
+// Given a pointer A that is being loaded from and a pointer B that is being
+// stored to, both with unknown lengths, create a mask that disables the
+// elements which could overlap across a loop iteration. For example, if A is
+// X and B is X + 2 with VF being 4, only the first two elements of the loaded
+// vector can be stored since they don't overlap with the stored vector:
+//   %a.vec = load %a ; = [s, t, u, v]
+//   [...]
+//   store %b, %a.vec ; only s and t can be stored as their addresses don't
+//                    ; overlap with %a + (VF - 1)
+class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
+
+public:
+  VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink, unsigned ElementSize,
+                        bool WriteAfterRead)
+      : VPSingleDefRecipe(VPDef::VPAliasLaneMaskSC, {Src, Sink}),
+        ElementSize(ElementSize), WriteAfterRead(WriteAfterRead) {}
+
+  ~VPAliasLaneMaskRecipe() override = default;
+
+  VPAliasLaneMaskRecipe *clone() override {
+    return new VPAliasLaneMaskRecipe(getSourceValue(), getSinkValue(),
+                                     ElementSize, WriteAfterRead);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPAliasLaneMaskSC);
+
+  void execute(VPTransformState &State) override;
+
+  /// Get the VPValue* for the pointer being read from
+  VPValue *getSourceValue() const { return getOperand(0); }
+
+  /// Get the size of the element(s) accessed by the pointers.
+  unsigned getAccessedElementSize() const { return ElementSize; }
+
+  /// Get the VPValue* for the pointer being stored to
+  VPValue *getSinkValue() const { return getOperand(1); }
+
+  bool isWriteAfterRead() const { return WriteAfterRead; }
+
+private:
+  unsigned ElementSize;
+  bool WriteAfterRead;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// Recipe to expand a SCEV expression.
 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
   const SCEV *Expr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index b39231f106300..0861a34ac3e42 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -134,6 +134,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case VPInstruction::BranchOnCond:
   case VPInstruction::BranchOnCount:
     return Type::getVoidTy(Ctx);
+  case VPInstruction::PopCount:
+    return Type::getInt64Ty(Ctx);
   default:
     break;
   }
@@ -308,6 +310,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           })
           .Case<VPExpressionRecipe>([this](const auto *R) {
             return inferScalarType(R->getOperandOfResultType());
+          }).Case<VPAliasLaneMaskRecipe>([this](const VPAliasLaneMaskRecipe *R) {
+            return Type::getInt1Ty(Ctx);
           });
 
   assert(ResultTy && "could not infer type for the given VPValue");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index b231a8429503f..4d8080df02930 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -638,24 +638,26 @@ static constexpr uint32_t CheckBypassWeights[] = {1, 127};
 
 void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
                                        BasicBlock *CheckBlock,
-                                       bool AddBranchWeights) {
-  VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
+                                       bool AddBranchWeights, bool HasAliasMask) {
   VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
   VPBlockBase *VectorPH = Plan.getVectorPreheader();
-  VPBlockBase *ScalarPH = Plan.getScalarPreheader();
   VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
   VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlockVPBB);
-  VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
-  CheckBlockVPBB->swapSuccessors();
-
-  // We just connected a new block to the scalar preheader. Update all
-  // VPPhis by adding an incoming value for it, replicating the last value.
-  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
-  for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
-    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
-    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
-           "must have incoming values for all operands");
-    R.addOperand(R.getOperand(NumPredecessors - 2));
+  VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
+  if (!HasAliasMask) {
+    VPBlockBase *ScalarPH = Plan.getScalarPreheader();
+    VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
+    CheckBlockVPBB->swapSuccessors();
+
+    // We just connected a new block to the scalar preheader. Update all
+    // VPPhis by adding an incoming value for it, replicating the last value.
+    unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+    for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+      assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+      assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+             "must have incoming values for all operands");
+      R.addOperand(R.getOperand(NumPredecessors - 2));
+    }
   }
 
   VPIRMetadata VPBranchWeights;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 23c10d2b25263..2801297fc0bbb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -519,6 +519,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::ExplicitVectorLength:
   case VPInstruction::AnyOf:
   case VPInstruction::Not:
+  case VPInstruction::PopCount:
     return true;
   default:
     return false;
@@ -625,6 +626,29 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
   }
+  // Count the number of bits set in each lane and reduce the result to a scalar
+  case VPInstruction::PopCount: {
+    Value *Op = State.get(getOperand(0));
+    Type *VT = Op->getType();
+    Value *Cnt = Op;
+
+    // i1 vectors can just use the add reduction. Bigger elements need a ctpop
+    // first.
+    if (VT->getScalarSizeInBits() > 1)
+      Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {VT}, {Cnt});
+
+    auto *VecVT = cast<VectorType>(VT);
+    // Extend to an i8, since i1 is too narrow to hold the lane count.
+    if (VecVT->getElementType()->getScalarSizeInBits() < 8) {
+      Cnt = Builder.CreateCast(
+          Instruction::ZExt, Cnt,
+          VectorType::get(Builder.getInt8Ty(), VecVT->getElementCount()));
+    }
+
+    Cnt = Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Cnt);
+    Cnt = Builder.CreateCast(Instruction::ZExt, Cnt, Builder.getInt64Ty());
+    return Cnt;
+  }
   case VPInstruction::FirstOrderRecurrenceSplice: {
     // Generate code to combine the previous and current values in vector v3.
     //
@@ -1033,7 +1057,7 @@ bool VPInstruction::isVectorToScalar() const {
          getOpcode() == VPInstruction::ComputeAnyOfResult ||
          getOpcode() == VPInstruction::ComputeFindIVResult ||
          getOpcode() == VPInstruction::ComputeReductionResult ||
-         getOpcode() == VPInstruction::AnyOf;
+         getOpcode() == VPInstruction::AnyOf || getOpcode() == PopCount;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -1206,6 +1230,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ActiveLaneMask:
     O << "active lane mask";
     break;
+  case VPInstruction::PopCount:
+    O << "popcount";
+    break;
   case VPInstruction::ExplicitVectorLength:
     O << "EXPLICIT-VECTOR-LENGTH";
     break;
@@ -3747,6 +3774,39 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
+  IRBuilderBase Builder = State.Builder;
+  Value *SinkValue = State.get(getSinkValue(), true);
+  Value *SourceValue = State.get(getSourceValue(), true);
+
+  unsigned IntrinsicID = WriteAfterRead
+                             ? Intrinsic::loop_dependence_war_mask
+                             : Intrinsic::loop_dependence_raw_mask;
+  Value *SourceAsPtr = Builder.CreateCast(Instruction::IntToPtr, SourceValue,
+                                          Builder.getPtrTy());
+  Value *SinkAsPtr =
+      Builder.CreateCast(Instruction::IntToPtr, SinkValue, Builder.getPtrTy());
+  Value *AliasMask = Builder.CreateIntrinsic(
+      IntrinsicID, {VectorType::get(Builder.getInt1Ty(), State.VF)},
+      {SourceAsPtr, SinkAsPtr, Builder.getInt64(getAccessedElementSize())},
+      nullptr, "alias.lane.mask");
+  State.set(this, AliasMask, /*IsScalar=*/false);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  getVPSingleValue()->printAsOperand(O, SlotTracker);
+  O << " = ALIAS-LANE-MASK ";
+  getSourceValue()->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getSinkValue()->printAsOperand(O, SlotTracker);
+  O << " (" << (WriteAfterRead ? "write-after-read" : "read-after-write")
+    << ")";
+}
+#endif
+
 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
   assert(!State.Lane && "cannot be used in per-lane");
   const DataLayout &DL = SE.getDataLayout();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c019311b2faba..b227fcf0e83c8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1966,8 +1966,9 @@ void VPlanTransforms::optimize(VPlan &Plan) {
 //   %Negated = Not %ALM
 //   branch-on-cond %Negated
 //
-static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
-    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+static VPValue *addVPLaneMaskPhiAndUpdateExitBranch(
+    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck,
+    PredicatedScalarEvolution &PSE, ArrayRef<PointerDiffInfo> RTChecks) {
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
@@ -1977,14 +1978,36 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
   // TODO: Check if dropping the flags is needed if
   // !DataAndControlFlowWithoutRuntimeCheck.
+  assert(CanonicalIVIncrement->getOperand(1) != CanonicalIVPHI &&
+         "Unexpected operand order");
+
   CanonicalIVIncrement->dropPoisonGeneratingFlags();
   DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+
   // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
   // we have to take unrolling into account. Each part needs to start at
   //   Part * VF
   auto *VecPreheader = Plan.getVectorPreheader();
   VPBuilder Builder(VecPreheader);
 
+  // Create an alias mask for each possibly-aliasing pointer pair. If there
+  // are multiple pairs, the masks are combined together with ANDs.
+  VPValue *AliasMask = nullptr;
+
+  for (auto C : RTChecks) {
+    VPValue *Sink =
+        vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SinkStart, *PSE.getSE());
+    VPValue *Src =
+        vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SrcStart, *PSE.getSE());
+    VPAliasLaneMaskRecipe *M =
+        new VPAliasLaneMaskRecipe(Src, Sink, C.AccessSize, C.WriteAfterRead);
+    VecPreheader->appendRecipe(M);
+    if (AliasMask)
+      AliasMask = Builder.createAnd(AliasMask, M);
+    else
+      AliasMask = M;
+  }
+
   // Create the ActiveLaneMask instruction using the correct start values.
   VPValue *TC = Plan.getTripCount();
 
@@ -2008,14 +2031,35 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
+  VPValue *Mask =
       Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                            DL, "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
-  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(Mask, DebugLoc());
   LaneMaskPhi->insertAfter(CanonicalIVPHI);
+  VPValue *LaneMask = LaneMaskPhi;
+  if (AliasMask) {
+    // Increment phi by correct amount.
+    Builder.setInsertPoint(CanonicalIVIncrement);
+
+    VPValue *IncrementBy = Builder.createNaryOp(VPInstruction::PopCount,
+                                                {AliasMask}, DL, "popcount");
+    Type *IVType = CanonicalIVPHI->getScalarType();
+
+    if (IVType->getScalarSizeInBits() < 64)
+      IncrementBy =
+          Builder.createScalarCast(Instruction::Trunc, IncrementBy, IVType,
+                                   CanonicalIVIncrement->getDebugLoc());
+    CanonicalIVIncrement->setOperand(1, IncrementBy);
+
+    // AND in the alias mask so only non-aliasing lanes are processed.
+    Builder.setInsertPoint(CanonicalIVPHI->getParent(),
+                           CanonicalIVPHI->getParent()->getFirstNonPhi());
+    LaneMask = Builder.createNaryOp(Instruction::BinaryOps::And,
+                                    {LaneMaskPhi, AliasMask}, DL);
+  }
 
   // Create the active lane mask for the next iteration of the loop before the
   // original terminator.
@@ -2034,7 +2078,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   auto *NotMask = Builder.createNot(ALM, DL);
   Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
   OriginalTerminator->eraseFromParent();
-  return LaneMaskPhi;
+  return LaneMask;
 }
 
 /// Collect all VPValues representing a header mask through the (ICMP_ULE,
@@ -2084,23 +2128,25 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
 
 void VPlanTransforms::addActiveLaneMask(
     VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
-    bool DataAndControlFlowWithoutRuntimeCheck) {
+    bool DataAndControlFlowWithoutRuntimeCheck, PredicatedScalarEvolution &PSE,
+    ArrayRef<PointerDiffInfo> RTChecks) {
+
   assert((!DataAndControlFlowWithoutRuntimeCheck ||
           UseActiveLaneMaskForControlFlow) &&
          "DataAndControlFlowWithoutRuntimeCheck implies "
          "UseActiveLaneMaskForControlFlow");
 
-  auto *FoundWidenCanonicalIVUser =
-      find_if(Plan.getCanonicalIV()->users(),
-              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
-  assert(FoundWidenCanonicalIVUser &&
+  auto IVUsers = Plan.getCanonicalIV()->users();
+  auto *FoundWidenCanonicalIVUser = find_if(
+      IVUsers, [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+  assert(FoundWidenCanonicalIVUser != IVUsers.end() &&
          "Must have widened canonical IV when tail folding!");
   auto *WideCanonicalIV =
       cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
-  VPSingleDefRecipe *LaneMask;
+  VPValue *LaneMask;
   if (UseActiveLaneMaskForControlFlow) {
     LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
-        Plan, DataAndControlFlowWithoutRuntimeCheck);
+        Plan, DataAndControlFlowWithoutRuntimeCheck, PSE, RTChecks);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
     LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5de1483783a42..4e3c2c82ef2fb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -89,9 +89,10 @@ struct VPlanTransforms {
 
   /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
   /// VPValue and connect the block to \p Plan, using the VPValue as branch
-  /// condition.
+  /// condition. If an alias mask has been set up, the check block won't
+  /// branch to the scalar preheader.
   static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
-                               bool AddBranchWeights);
+                               bool AddBranchWeights, bool HasAliasMask);
 
   /// Replaces the VPInstructions in \p Plan with corresponding
   /// widen recipes. Returns false if any VPInstructions could not be converted
@@ -155,9 +155,14 @@ struct VPlanTransforms {
   /// creation) and instead it is handled using active-lane-mask. \p
   /// DataAndControlFlowWithoutRuntimeCheck implies \p
   /// UseActiveLaneMaskForControlFlow.
+  /// PSE is used to expand the SCEVs of values used in the runtime checks.
+  /// RTChecks is the list of pointer pairs whose aliasing lanes need to be
+  /// masked off each loop iteration.
   static void addActiveLaneMask(VPlan &Plan,
                                 bool UseActiveLaneMaskForControlFlow,
-                                bool DataAndControlFlowWithoutRuntimeCheck);
+                                bool DataAndControlFlowWithoutRuntimeCheck,
+                                PredicatedScalarEvolution &PSE,
+                                ArrayRef<PointerDiffInfo> RTChecks);
 
   /// Insert truncates and extends for any truncated recipe. Redundant casts
   /// will be folded later.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef6..be73a4b72965b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -331,6 +331,7 @@ class VPDef {
   using VPRecipeTy = enum {
     VPBranchOnMaskSC,
     VPDerivedIVSC,
+    VPAliasLaneMaskSC,
     VPExpandSCEVSC,
     VPExpressionSC,
     VPIRInstructionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
new file mode 100644
index 0000000000000..4f1696b7d7e1f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --filter-out-after "^scalar.ph:" --version 4
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+
+define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define dso_local void @alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C2:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[C2]], [[B3]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP7]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP5]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[B2]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[C1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT6]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+; CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP14]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP30]], ptr [[TMP31]], i64 1)
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP8]], 16
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[N]], [[TMP15]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i64 [[N]], [[TMP15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP23:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP28]])
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i8 [[TMP23]] to i64
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
+; CHECK-NEXT:    [[TMP26:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i32 0
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+;
+entry:
+  %cmp11 = icmp sgt i64 %n, 0
+  br i1 %cmp11, label %for.body, label %exit
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %add = add i8 %load.b, %load.a
+  %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+  store i8 %add, ptr %gep.c, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  ret void
+}
+
+define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define i32 @alias_mask_read_after_write(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C2:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[B2]], [[C3]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[C2]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[B1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT6]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP28]] to ptr
+; CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP15]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[TMP34]], ptr [[TMP35]], i64 4)
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP31:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP20]], i32 2, <vscale x 4 x i1> [[TMP31]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP23:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 4 x i32> [[TMP23]], [[WIDE_MASKED_LOAD5]]
+; CHECK-NEXT:    [[TMP25]] = select <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP32:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP32]])
+; CHECK-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i64
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
+; CHECK-NEXT:    [[TMP29:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 4 x i1> [[TMP29]], i32 0
+; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+;
+entry:
+  %cmp19 = icmp sgt i64 %n, 0
+  br i1 %cmp19, label %for.body, label %exit
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+  %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+  %load.a = load i32, ptr %gep.a, align 2
+  %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+  store i32 %load.a, ptr %gep.c, align 2
+  %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+  %load.b = load i32, ptr %gep.b, align 2
+  %add = add i32 %load.a, %accum
+  %add2 = add i32 %add, %load.b
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                        ; preds = %entry, %for.body
+  %result = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+  ret i32 %result
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 3cef1f6e03ff9..77eeac18e9b1d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -28,6 +28,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
 ; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
 ; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[TMP13:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
@@ -58,7 +60,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[TMP38]], i64 [[TMP42]]
 ; DEFAULT-NEXT:    store <vscale x 8 x i8> [[TMP36]], ptr [[TMP38]], align 1
 ; DEFAULT-NEXT:    store <vscale x 8 x i8> [[TMP37]], ptr [[TMP43]], align 1
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
 ; DEFAULT-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; DEFAULT-NEXT:    br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
@@ -90,19 +92,36 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:  [[ENTRY:.*]]:
 ; PRED-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; PRED-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT:    [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
 ; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; PRED-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; PRED:       [[VECTOR_MEMCHECK]]:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
-; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; PRED-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; PRED-NEXT:    br label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
+; PRED-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP5]], 1
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[SRC2]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[DST1]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PRED-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT6]], i32 0
+; PRED-NEXT:    [[TMP33:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT1]], i32 0
+; PRED-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+; PRED-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP25]] to ptr
+; PRED-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP34]], ptr [[TMP35]], i64 1)
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -113,7 +132,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; PRED:       [[VECTOR_BODY]]:
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], [[ALIAS_LANE_MASK]]
 ; PRED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
 ; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; PRED-NEXT:    [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
@@ -123,16 +143,20 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
 ; PRED-NEXT:    [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
+; PRED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
+; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT:    [[TMP30:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
+; PRED-NEXT:    [[TMP31:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP30]])
+; PRED-NEXT:    [[TMP32:%.*]] = zext i8 [[TMP31]] to i64
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP32]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT:    [[TMP27:%.*]] = xor i1 [[TMP25]], true
-; PRED-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED-NEXT:    [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
+; PRED-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
 ; PRED:       [[SCALAR_PH]]:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; PRED-NEXT:    br label %[[LOOP:.*]]
 ; PRED:       [[LOOP]]:
 ; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -262,6 +286,9 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
 ; PRED-NEXT:    br i1 [[TMP12]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], 2
 ; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2
 ; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
@@ -293,9 +320,9 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED:       [[PRED_STORE_CONTINUE2]]:
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT:    [[TMP25:%.*]] = xor i1 [[TMP24]], true
+; PRED-NEXT:    [[TMP24:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
 ; PRED-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; PRED-NEXT:    [[TMP25:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0
 ; PRED-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
@@ -430,6 +457,9 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
 ; PRED-NEXT:    br i1 [[TMP13]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP14:%.*]] = sub i64 [[TMP0]], 4
 ; PRED-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
 ; PRED-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
@@ -482,9 +512,9 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED:       [[PRED_STORE_CONTINUE7]]:
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
-; PRED-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT:    [[TMP36:%.*]] = xor i1 [[TMP35]], true
+; PRED-NEXT:    [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
 ; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PRED-NEXT:    [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0
 ; PRED-NEXT:    br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
@@ -621,6 +651,9 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
 ; PRED-NEXT:    br i1 [[TMP12]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], 4
 ; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4
 ; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
@@ -673,9 +706,9 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED:       [[PRED_STORE_CONTINUE6]]:
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT:    [[TMP35:%.*]] = xor i1 [[TMP34]], true
+; PRED-NEXT:    [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
 ; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PRED-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0
 ; PRED-NEXT:    br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index b640c1911cb0d..e48979c4532a6 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -339,12 +339,13 @@ exit:
 ; of the outer loop as start value. It is sufficient to subtract the start
 ; values (%dst, %src) of the outer AddRecs.
 define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %m, i64 noundef %n) {
+;
 ; CHECK-LABEL: define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(
 ; CHECK-SAME: ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i64 noundef [[M:%.*]], i64 noundef [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; CHECK-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
-; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; CHECK-NEXT:    br label %[[OUTER_LOOP:.*]]
 ; CHECK:       [[OUTER_LOOP]]:
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
@@ -352,7 +353,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
-; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[SUB]], 16
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
new file mode 100644
index 0000000000000..d3c7c9c14802c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
@@ -0,0 +1,115 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data-and-control -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Tests for printing predicated VPlans.
+
+define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: 'alias_mask'
+; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: vp<%3> = original trip-count
+; CHECK-EMPTY: 
+; CHECK-NEXT: ir-bb<for.body.preheader>:
+; CHECK-NEXT:   IR   %wide.trip.count = zext nneg i32 %n to i64
+; CHECK-NEXT:   EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:   EMIT vp<%4> = EXPAND SCEV (ptrtoint ptr %c to i64)
+; CHECK-NEXT:   EMIT vp<%5> = EXPAND SCEV (ptrtoint ptr %b to i64)
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY: 
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   EMIT vp<%6> = ALIAS-LANE-MASK vp<%5>, vp<%4> (write-after-read)
+; CHECK-NEXT:   EMIT vp<%index.part.next> = VF * Part + ir<0>
+; CHECK-NEXT:   EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, vp<%3>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY: 
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%7> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     ACTIVE-LANE-MASK-PHI vp<%8> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next>
+; CHECK-NEXT:     EMIT vp<%9> = and vp<%8>, vp<%6>
+; CHECK-NEXT:   Successor(s): pred.store
+; CHECK-EMPTY: 
+; CHECK-NEXT:   <xVFxUF> pred.store: {
+; CHECK-NEXT:     pred.store.entry:
+; CHECK-NEXT:       BRANCH-ON-MASK vp<%9>
+; CHECK-NEXT:     Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY: 
+; CHECK-NEXT:     pred.store.if:
+; CHECK-NEXT:       vp<%10> = SCALAR-STEPS vp<%7>, ir<1>, vp<%0>
+; CHECK-NEXT:       REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%10>
+; CHECK-NEXT:       REPLICATE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT:       REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%10>
+; CHECK-NEXT:       REPLICATE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT:       REPLICATE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<%10>
+; CHECK-NEXT:       REPLICATE ir<%add> = add ir<%1>, ir<%0>
+; CHECK-NEXT:       REPLICATE store ir<%add>, ir<%arrayidx6>
+; CHECK-NEXT:     Successor(s): pred.store.continue
+; CHECK-EMPTY: 
+; CHECK-NEXT:     pred.store.continue:
+; CHECK-NEXT:     No successors
+; CHECK-NEXT:   }
+; CHECK-NEXT:   Successor(s): for.body.2
+; CHECK-EMPTY: 
+; CHECK-NEXT:   for.body.2:
+; CHECK-NEXT:     EMIT vp<%popcount> = popcount vp<%6>
+; CHECK-NEXT:     EMIT vp<%index.next> = add vp<%7>, vp<%popcount>
+; CHECK-NEXT:     EMIT vp<%11> = VF * Part + vp<%index.next>
+; CHECK-NEXT:     EMIT vp<%active.lane.mask.next> = active lane mask vp<%11>, vp<%3>
+; CHECK-NEXT:     EMIT vp<%12> = not vp<%active.lane.mask.next>
+; CHECK-NEXT:     EMIT branch-on-cond vp<%12>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY: 
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>
+; CHECK-EMPTY: 
+; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT-SCALAR vp<%bc.resume.val> = phi [ ir<0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT:   IR   %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+; CHECK-NEXT:   IR   %0 = load i8, ptr %arrayidx, align 1
+; CHECK-NEXT:   IR   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+; CHECK-NEXT:   IR   %1 = load i8, ptr %arrayidx2, align 1
+; CHECK-NEXT:   IR   %add = add i8 %1, %0
+; CHECK-NEXT:   IR   %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+; CHECK-NEXT:   IR   store i8 %add, ptr %arrayidx6, align 1
+; CHECK-NEXT:   IR   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT:   IR   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %add = add i8 %1, %0
+  %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+  store i8 %add, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}

>From bccfac0519d674b6475fdea127c49d87310a7d97 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 29 Jul 2025 10:48:25 +0100
Subject: [PATCH 02/13] Check popcount of mask before entering vector body

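    Rather than branching straight into the vector loop, the memcheck block
    now materialises the alias lane mask, reduces it to a popcount and uses
    that value as the runtime-check condition. In outline (illustrative IR
    only; register names below are not exact, and the post-instcombine form
    can be seen in the updated alias_mask.ll test):

        %alias.lane.mask = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %b, ptr %c, i64 1)
        %mask.ext = zext <vscale x 16 x i1> %alias.lane.mask to <vscale x 16 x i8>
        %mask.sum = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %mask.ext)
        %popcount = zext i8 %mask.sum to i64
        %mem.check = icmp ugt i64 %popcount, 0

    The comparison result feeds the existing memcheck branch between the
    vector and scalar preheaders, and the same popcount is reused as the
    induction-variable increment so that only the known-safe lanes advance
    the index.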
---
 .../llvm/Analysis/TargetTransformInfo.h       |   4 +-
 .../include/llvm/Transforms/Utils/LoopUtils.h |   8 +-
 .../AArch64/AArch64TargetTransformInfo.cpp    |   5 +
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |  80 +++++++---
 .../Vectorize/LoopVectorizationPlanner.h      |  24 ++-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 141 +++++++++++-------
 llvm/lib/Transforms/Vectorize/VPlan.h         |   2 +-
 .../Vectorize/VPlanConstruction.cpp           |  30 ++--
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   8 +-
 .../Transforms/Vectorize/VPlanTransforms.h    |   4 +-
 .../LoopVectorize/AArch64/alias_mask.ll       | 113 ++++----------
 .../vplan-printing-predicated.ll              |  18 +--
 12 files changed, 230 insertions(+), 207 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 6e10fa4573182..77d74ce75c47b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -202,8 +202,8 @@ enum class TailFoldingStyle {
 };
 
 enum class RTCheckStyle {
-  /// Branch to scalar loop if checks fails at runtime.
-  ScalarFallback,
+  /// Create runtime checks based on the difference between two pointers.
+  ScalarDifference,
   /// Form a mask based on elements which won't be a WAR or RAW hazard.
   UseSafeEltsMask,
 };
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 723f6aea1b76f..e9515c32b0603 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -569,9 +569,11 @@ addRuntimeChecks(Instruction *Loc, Loop *TheLoop,
                  const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
                  SCEVExpander &Expander, bool HoistRuntimeChecks = false);
 
-LLVM_ABI Value *addDiffRuntimeChecks(
-    Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
-    function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC);
+LLVM_ABI Value *
+addDiffRuntimeChecks(Instruction *Loc, ArrayRef<PointerDiffInfo> Checks,
+                     SCEVExpander &Expander,
+                     function_ref<Value *(IRBuilderBase &, unsigned)> GetVF,
+                     unsigned IC, ElementCount VF, bool UseSafeEltsMask);
 
 /// Struct to hold information about a partially invariant condition.
 struct IVConditionInfo {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ee9cb177e7f50..0b6f6ef447c77 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -989,6 +989,11 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     break;
   }
+  case Intrinsic::loop_dependence_raw_mask:
+  case Intrinsic::loop_dependence_war_mask:
+    if (ST->hasSVE2())
+      return 1;
+    return InstructionCost::getInvalid(CostKind);
   default:
     break;
   }
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 2d3e7f85ee3a8..248f6fa7d7f5e 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2022,7 +2022,8 @@ Value *llvm::addRuntimeChecks(
 
 Value *llvm::addDiffRuntimeChecks(
     Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
-    function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC) {
+    function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC,
+    ElementCount VF, bool UseSafeEltsMask) {
 
   LLVMContext &Ctx = Loc->getContext();
   IRBuilder ChkBuilder(Ctx, InstSimplifyFolder(Loc->getDataLayout()));
@@ -2034,33 +2035,68 @@ Value *llvm::addDiffRuntimeChecks(
   // Map to keep track of created compares, The key is the pair of operands for
   // the compare, to allow detecting and re-using redundant compares.
   DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
+  Value *AliasLaneMask = nullptr;
   for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze,
                     WriteAfterRead] : Checks) {
     Type *Ty = SinkStart->getType();
-    // Compute VF * IC * AccessSize.
-    auto *VFTimesICTimesSize =
-        ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
-                             ConstantInt::get(Ty, IC * AccessSize));
-    Value *Diff =
-        Expander.expandCodeFor(SE.getMinusSCEV(SinkStart, SrcStart), Ty, Loc);
-
-    // Check if the same compare has already been created earlier. In that case,
-    // there is no need to check it again.
-    Value *IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
-    if (IsConflict)
-      continue;
+    if (!VF.isScalar() && UseSafeEltsMask) {
+      Value *Sink = Expander.expandCodeFor(SinkStart, Ty, Loc);
+      Value *Src = Expander.expandCodeFor(SrcStart, Ty, Loc);
+      unsigned IntOpc = WriteAfterRead ? Intrinsic::loop_dependence_war_mask
+                                       : Intrinsic::loop_dependence_raw_mask;
+      Value *SourceAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Src,
+                                                 ChkBuilder.getPtrTy());
+      Value *SinkAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Sink,
+                                               ChkBuilder.getPtrTy());
+      Value *M = ChkBuilder.CreateIntrinsic(
+          IntOpc, {VectorType::get(ChkBuilder.getInt1Ty(), VF)},
+          {SourceAsPtr, SinkAsPtr, ChkBuilder.getInt64(AccessSize)}, nullptr,
+          "alias.lane.mask");
+      if (AliasLaneMask)
+        M = ChkBuilder.CreateAnd(AliasLaneMask, M);
+      else
+        AliasLaneMask = M;
+    } else {
+      // Compute VF * IC * AccessSize.
+      auto *VFTimesICTimesSize =
+          ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
+                               ConstantInt::get(Ty, IC * AccessSize));
+      Value *Diff =
+          Expander.expandCodeFor(SE.getMinusSCEV(SinkStart, SrcStart), Ty, Loc);
+
+      // Check if the same compare has already been created earlier. In that
+      // case, there is no need to check it again.
+      Value *IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
+      if (IsConflict)
+        continue;
 
-    IsConflict =
-        ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
-    SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
-    if (NeedsFreeze)
-      IsConflict =
-          ChkBuilder.CreateFreeze(IsConflict, IsConflict->getName() + ".fr");
-    if (MemoryRuntimeCheck) {
       IsConflict =
-          ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
+          ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
+      SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
+      if (NeedsFreeze)
+        IsConflict =
+            ChkBuilder.CreateFreeze(IsConflict, IsConflict->getName() + ".fr");
+      if (MemoryRuntimeCheck) {
+        IsConflict =
+            ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
+      }
+      MemoryRuntimeCheck = IsConflict;
     }
-    MemoryRuntimeCheck = IsConflict;
+  }
+
+  if (AliasLaneMask) {
+    auto *VecVT = VectorType::get(ChkBuilder.getInt1Ty(), VF);
+    // Extend the i1 mask to i8 first, since i1 lanes are too narrow to sum.
+    Value *PopCount = ChkBuilder.CreateCast(
+        Instruction::ZExt, AliasLaneMask,
+        VectorType::get(ChkBuilder.getInt8Ty(), VecVT->getElementCount()));
+
+    PopCount =
+        ChkBuilder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, PopCount);
+    PopCount = ChkBuilder.CreateCast(Instruction::ZExt, PopCount,
+                                     ChkBuilder.getInt64Ty());
+    MemoryRuntimeCheck = ChkBuilder.CreateICmpUGT(
+        PopCount, ConstantInt::get(ChkBuilder.getInt64Ty(), 0));
   }
 
   return MemoryRuntimeCheck;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index b5d9d6e4b7837..4911d2b69e8f8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -494,12 +494,10 @@ class LoopVectorizationPlanner {
   /// non-zero or all applicable candidate VFs otherwise. If vectorization and
   /// interleaving should be avoided up-front, no plans are generated.
   /// DiffChecks is a list of pointer pairs that should be checked for aliasing,
-  /// setting HasAliasMask to true in the case that an alias mask is generated
-  /// and the vector loop should be entered even if the pointers alias across a
-  /// loop iteration.
+  /// with the resulting predicate combined with the active lane mask if one
+  /// is in use.
   void plan(ElementCount UserVF, unsigned UserIC,
-            std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
-            bool &HasAliasMask);
+            std::optional<ArrayRef<PointerDiffInfo>> DiffChecks);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
@@ -585,22 +583,20 @@ class LoopVectorizationPlanner {
   /// built. Each VPlan is built starting from a copy of \p InitialPlan, which
   /// is a plain CFG VPlan wrapping the original scalar loop.
   /// RTChecks is a list of pointer pairs that should be checked for aliasing,
-  /// setting HasAliasMask to true in the case that an alias mask is generated
-  /// and the vector loop should be entered even if the pointers alias across a
-  /// loop iteration.
+  /// with the resulting predicate combined with the active lane mask if one
+  /// is in use.
   VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range,
                                         LoopVersioning *LVer,
-                                        ArrayRef<PointerDiffInfo> RTChecks,
-                                        bool &HasAliasMask);
+                                        ArrayRef<PointerDiffInfo> RTChecks);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
-  /// RTChecks contains a list of pointer pairs that an alias mask should be
-  /// generated for.
+  /// RTChecks is a list of pointer pairs that should be checked for aliasing;
+  /// the resulting predicate is combined with the active lane mask if one is
+  /// in use.
   void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
-                                ArrayRef<PointerDiffInfo> RTChecks,
-                                bool &HasAliasMask);
+                                ArrayRef<PointerDiffInfo> RTChecks);
 
   // Adjust the recipes for reductions. For in-loop reductions the chain of
   // instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3aa422d4fa16c..806ce5cba07f9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -568,11 +568,7 @@ class InnerLoopVectorizer {
   /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
   /// vector preheader and its predecessor, also connecting the new block to the
   /// scalar preheader.
-  /// If HasAliasMask is true then the vector loop will be branched to
-  /// unconditionally, instead of there being a conditional branch to the scalar
-  /// loop or vector loop
-  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB,
-                                  bool HasAliasMask = false);
+  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
 
   /// The original loop.
   Loop *OrigLoop;
@@ -1360,7 +1356,7 @@ class LoopVectorizationCostModel {
     case TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck:
       return RTCheckStyle::UseSafeEltsMask;
     default:
-      return RTCheckStyle::ScalarFallback;
+      return RTCheckStyle::ScalarDifference;
     }
   }
 
@@ -1823,10 +1819,6 @@ class GeneratedRTChecks {
   TTI::TargetCostKind CostKind;
 
 public:
-  /// Set by VPlan when the vector loop should be entered even when runtime
-  /// checks determine that pointers alias within an iteration.
-  bool HasAliasMask = false;
-
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
                     const DataLayout &DL, TTI::TargetCostKind CostKind)
@@ -1840,7 +1832,8 @@ class GeneratedRTChecks {
   /// there is no vector code generation, the check blocks are removed
   /// completely.
   void create(Loop *L, const LoopAccessInfo &LAI,
-              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
+              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
+              bool UseSafeEltsMask) {
 
     // Hard cutoff to limit compile-time increase in case a very large number of
     // runtime checks needs to be generated.
@@ -1888,7 +1881,7 @@ class GeneratedRTChecks {
                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
               return RuntimeVF;
             },
-            IC);
+            IC, VF, UseSafeEltsMask);
       } else {
         MemRuntimeCheckCond = addRuntimeChecks(
             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
@@ -2301,7 +2294,57 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
   return TTI.enableMaskedInterleavedAccessVectorization();
 }
 
-void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB, bool HasAliasMask) {
+Value *
+InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
+  if (VectorTripCount)
+    return VectorTripCount;
+
+  Value *TC = getTripCount();
+  IRBuilder<> Builder(InsertBlock->getTerminator());
+
+  Type *Ty = TC->getType();
+  // This is where we can make the step a runtime constant.
+  Value *Step = createStepForVF(Builder, Ty, VF, UF);
+
+  // If the tail is to be folded by masking, round the number of iterations N
+  // up to a multiple of Step instead of rounding down. This is done by first
+  // adding Step-1 and then rounding down. Note that it's ok if this addition
+  // overflows: the vector induction variable will eventually wrap to zero given
+  // that it starts at zero and its Step is a power of two; the loop will then
+  // exit, with the last early-exit vector comparison also producing all-true.
+  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
+  // is accounted for in emitIterationCountCheck that adds an overflow check.
+  if (Cost->foldTailByMasking()) {
+    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
+           "VF*UF must be a power of 2 when folding tail by masking");
+    TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
+                           "n.rnd.up");
+  }
+
+  // Now we need to generate the expression for the part of the loop that the
+  // vectorized body will execute. This is equal to N - (N % Step) if scalar
+  // iterations are not required for correctness, or N - Step, otherwise. Step
+  // is equal to the vectorization factor (number of SIMD elements) times the
+  // unroll factor (number of SIMD instructions).
+  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+
+  // There are cases where we *must* run at least one iteration in the remainder
+  // loop.  See the cost model for when this can happen.  If the step evenly
+  // divides the trip count, we set the remainder to be equal to the step. If
+  // the step does not evenly divide the trip count, no adjustment is necessary
+  // since there will already be scalar iterations. Note that the minimum
+  // iterations check ensures that N >= Step.
+  if (Cost->requiresScalarEpilogue(VF.isVector())) {
+    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
+    R = Builder.CreateSelect(IsZero, Step, R);
+  }
+
+  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
+
+  return VectorTripCount;
+}
+
+void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
   // Note: The block with the minimum trip-count check is already connected
   // during earlier VPlan construction.
   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
@@ -2311,19 +2354,17 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB, bool
   VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
   VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
   PreVectorPH = CheckVPIRBB;
-  if (!HasAliasMask) {
-      VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
-      PreVectorPH->swapSuccessors();
-
-      // We just connected a new block to the scalar preheader. Update all
-      // VPPhis by adding an incoming value for it, replicating the last value.
-      unsigned NumPredecessors = ScalarPH->getNumPredecessors();
-      for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
-        assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
-        assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
-               "must have incoming values for all operands");
-        R.addOperand(R.getOperand(NumPredecessors - 2));
-      }
+  VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
+  PreVectorPH->swapSuccessors();
+
+  // We just connected a new block to the scalar preheader. Update all
+  // VPPhis by adding an incoming value for it, replicating the last value.
+  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+  for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+           "must have incoming values for all operands");
+    R.addOperand(R.getOperand(NumPredecessors - 2));
   }
 }
 
@@ -6728,7 +6769,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 
 void LoopVectorizationPlanner::plan(
     ElementCount UserVF, unsigned UserIC,
-    std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
+    std::optional<ArrayRef<PointerDiffInfo>> RTChecks) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
@@ -6775,7 +6816,7 @@ void LoopVectorizationPlanner::plan(
       CM.collectInLoopReductions();
       if (CM.selectUserVectorizationFactor(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-        buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks, HasAliasMask);
+        buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks);
         LLVM_DEBUG(printPlans(dbgs()));
         return;
       }
@@ -6800,9 +6841,9 @@ void LoopVectorizationPlanner::plan(
   }
 
   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
-                           DiffChecks, HasAliasMask);
+                           DiffChecks);
   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
-                           DiffChecks, HasAliasMask);
+                           DiffChecks);
 
   LLVM_DEBUG(printPlans(dbgs()));
 }
@@ -8411,9 +8452,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
                                       ScaleFactor, Reduction);
 }
 
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
-    ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
-    bool &HasAliasMask) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
+    ElementCount MinVF, ElementCount MaxVF,
+    ArrayRef<PointerDiffInfo> DiffChecks) {
   if (ElementCount::isKnownGT(MinVF, MaxVF))
     return;
 
@@ -8439,7 +8480,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
     if (auto Plan = tryToBuildVPlanWithVPRecipes(
-            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer, RTChecks, HasAliasMask)) {
+            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer,
+            DiffChecks)) {
       bool HasScalarVF = Plan->hasScalarVFOnly();
       // Now optimize the initial VPlan.
       if (!HasScalarVF)
@@ -8664,7 +8706,8 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
 }
 
 VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
-    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask) {
+    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer,
+    ArrayRef<PointerDiffInfo> DiffChecks) {
 
   using namespace llvm::VPlanPatternMatch;
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8953,9 +8996,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     bool WithoutRuntimeCheck =
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
-                                       WithoutRuntimeCheck, PSE, RTChecks);
-    if (ForControlFlow && !RTChecks.empty())
-      HasAliasMask = true;
+                                       WithoutRuntimeCheck, PSE, DiffChecks);
   }
   VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
 
@@ -9371,7 +9412,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
             CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
            "Cannot SCEV check stride or overflow when optimizing for size");
     VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
-                                      HasBranchWeights, RTChecks.HasAliasMask);
+                                      HasBranchWeights);
   }
   const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
   if (MemCheckBlock) {
@@ -9396,7 +9437,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
       });
     }
     VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
-                                      HasBranchWeights, RTChecks.HasAliasMask);
+                                      HasBranchWeights);
   }
 }
 
@@ -9527,7 +9568,6 @@ static bool processLoopInVPlanNativePath(
   // Mark the loop as already vectorized to avoid vectorizing again.
   Hints.setAlreadyVectorized();
   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
-
   return true;
 }
 
@@ -10142,30 +10182,31 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
 
-  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
-                           CM.CostKind);
-
   // Plan how to best vectorize.
   LVP.plan(UserVF, UserIC,
-           LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(),
-           Checks.HasAliasMask);
+           LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks());
   VectorizationFactor VF = LVP.computeBestVF();
-  if (Checks.HasAliasMask)
-    LoopsAliasMasked++;
   unsigned IC = 1;
 
   if (ORE->allowExtraAnalysis(LV_NAME))
     LVP.emitInvalidCostRemarks(ORE);
 
+  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
 
     unsigned SelectedIC = std::max(IC, UserIC);
-    //  Optimistically generate runtime checks if they are needed. Drop them if
+    // Optimistically generate runtime checks if they are needed. Drop them if
     //  they turn out to not be profitable.
     if (VF.Width.isVector() || SelectedIC > 1) {
-      Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+      TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
+      bool UseSafeEltsMask =
+          useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle));
+      if (UseSafeEltsMask)
+        LoopsAliasMasked++;
+      Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
+                    UseSafeEltsMask);
 
       // Bail out early if either the SCEV or memory runtime checks are known to
       // fail. In that case, the vector loop would never execute.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 39269eaf29add..6835e2e5e3f98 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3208,7 +3208,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   }
 };
 
-// Given a pointer A that is being laoded from, and pointer B that is being
+// Given a pointer A that is being loaded from, and pointer B that is being
 // stored to, both with unknown lengths, create a mask that disables
 // elements which could overlap across a loop iteration. For example, if A
 // is X and B is X + 2 with VF being 4, only the first two elements of the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 4d8080df02930..b231a8429503f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -638,26 +638,24 @@ static constexpr uint32_t CheckBypassWeights[] = {1, 127};
 
 void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
                                        BasicBlock *CheckBlock,
-                                       bool AddBranchWeights, bool HasAliasMask) {
+                                       bool AddBranchWeights) {
+  VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
   VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
   VPBlockBase *VectorPH = Plan.getVectorPreheader();
+  VPBlockBase *ScalarPH = Plan.getScalarPreheader();
   VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
   VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlockVPBB);
-  VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
-  if (!HasAliasMask) {
-    VPBlockBase *ScalarPH = Plan.getScalarPreheader();
-    VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
-    CheckBlockVPBB->swapSuccessors();
-
-    // We just connected a new block to the scalar preheader. Update all
-    // VPPhis by adding an incoming value for it, replicating the last value.
-    unsigned NumPredecessors = ScalarPH->getNumPredecessors();
-    for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
-      assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
-      assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
-             "must have incoming values for all operands");
-      R.addOperand(R.getOperand(NumPredecessors - 2));
-    }
+  VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
+  CheckBlockVPBB->swapSuccessors();
+
+  // We just connected a new block to the scalar preheader. Update all
+  // VPPhis by adding an incoming value for it, replicating the last value.
+  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+  for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+           "must have incoming values for all operands");
+    R.addOperand(R.getOperand(NumPredecessors - 2));
   }
 
   VPIRMetadata VPBranchWeights;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b227fcf0e83c8..0ba4744d52f67 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2041,11 +2041,15 @@ static VPValue *addVPLaneMaskPhiAndUpdateExitBranch(
   LaneMaskPhi->insertAfter(CanonicalIVPHI);
   VPValue *LaneMask = LaneMaskPhi;
   if (AliasMask) {
+    auto &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
+    VPBuilder CountBuilder =
+        VPBuilder::getToInsertAfter(AliasMask->getDefiningRecipe());
+    VPValue *PopCount = CountBuilder.createNaryOp(VPInstruction::PopCount,
+                                                  {AliasMask}, DL, "popcount");
     // Increment phi by correct amount.
     Builder.setInsertPoint(CanonicalIVIncrement);
 
-    VPValue *IncrementBy = Builder.createNaryOp(VPInstruction::PopCount,
-                                                {AliasMask}, DL, "popcount");
+    VPValue *IncrementBy = PopCount;
     Type *IVType = CanonicalIVPHI->getScalarType();
 
     if (IVType->getScalarSizeInBits() < 64)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 4e3c2c82ef2fb..e97023e52d912 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -89,9 +89,9 @@ struct VPlanTransforms {
 
   /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
   /// VPValue and connect the block to \p Plan, using the VPValue as branch
-  /// condition. If an alias mask has been set up then the checkblock won't branch to the scalar preheader.
+  /// condition.
   static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
-                               bool AddBranchWeights, bool HasAliasMask);
+                               bool AddBranchWeights);
 
   /// Replaces the VPInstructions in \p Plan with corresponding
   /// widen recipes. Returns false if any VPInstructions could not be converted
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index 4f1696b7d7e1f..acd6beb303954 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -1,47 +1,25 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --filter-out-after "^scalar.ph:" --version 4
-; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize,instcombine,early-cse -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
 
 define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK-LABEL: define dso_local void @alias_mask(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT:    [[C2:%.*]] = ptrtoint ptr [[C]] to i64
 ; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[C2]], [[B3]]
-; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[B]], ptr [[C]], i64 1)
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP7]], 16
-; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP6]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP5]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[B2]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[C1]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT6]], i32 0
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-; CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP14]] to ptr
-; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP30]], ptr [[TMP31]], i64 1)
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i8 [[TMP1]] to i64
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP8]], 16
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[N]], [[TMP15]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i64 [[N]], [[TMP15]]
-; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -49,23 +27,16 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP25:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
 ; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[TMP25]])
-; CHECK-NEXT:    [[TMP28:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
-; CHECK-NEXT:    [[TMP23:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP28]])
-; CHECK-NEXT:    [[TMP24:%.*]] = zext i8 [[TMP23]] to i64
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP21]], i32 1, <vscale x 16 x i1> [[TMP25]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
-; CHECK-NEXT:    [[TMP26:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i32 0
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ;
 entry:
@@ -93,44 +64,21 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
 ; CHECK-LABEL: define i32 @alias_mask_read_after_write(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[C2:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; CHECK-NEXT:    [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[B2]], [[C3]]
-; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[C]], ptr [[B]], i64 4)
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP8]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP7]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[C2]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[B1]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT6]], i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP28]] to ptr
-; CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP15]] to ptr
-; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[TMP34]], ptr [[TMP35]], i64 4)
+; CHECK-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP1]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP9]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP16]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP9]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -139,25 +87,18 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP31:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP17]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP20]], i32 2, <vscale x 4 x i1> [[TMP31]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP19]], i32 2, <vscale x 4 x i1> [[TMP31]])
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP21]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP23:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 4 x i32> [[TMP23]], [[WIDE_MASKED_LOAD5]]
 ; CHECK-NEXT:    [[TMP25]] = select <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP32:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
-; CHECK-NEXT:    [[TMP26:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP32]])
-; CHECK-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i64
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
-; CHECK-NEXT:    [[TMP29:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 4 x i1> [[TMP29]], i32 0
-; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP13]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
index d3c7c9c14802c..e4c142c73b8ec 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
@@ -10,32 +10,32 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<%0> = VF
 ; CHECK-NEXT: vp<%3> = original trip-count
-; CHECK-EMPTY: 
+; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body.preheader>:
 ; CHECK-NEXT:   IR   %wide.trip.count = zext nneg i32 %n to i64
 ; CHECK-NEXT:   EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:   EMIT vp<%4> = EXPAND SCEV (ptrtoint ptr %c to i64)
 ; CHECK-NEXT:   EMIT vp<%5> = EXPAND SCEV (ptrtoint ptr %b to i64)
 ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
-; CHECK-EMPTY: 
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
 ; CHECK-NEXT:   EMIT vp<%6> = ALIAS-LANE-MASK vp<%5>, vp<%4> (write-after-read)
 ; CHECK-NEXT:   EMIT vp<%index.part.next> = VF * Part + ir<0>
 ; CHECK-NEXT:   EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, vp<%3>
 ; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY: 
+; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
 ; CHECK-NEXT:   vector.body:
 ; CHECK-NEXT:     EMIT vp<%7> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK-NEXT:     ACTIVE-LANE-MASK-PHI vp<%8> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next>
 ; CHECK-NEXT:     EMIT vp<%9> = and vp<%8>, vp<%6>
 ; CHECK-NEXT:   Successor(s): pred.store
-; CHECK-EMPTY: 
+; CHECK-EMPTY:
 ; CHECK-NEXT:   <xVFxUF> pred.store: {
 ; CHECK-NEXT:     pred.store.entry:
 ; CHECK-NEXT:       BRANCH-ON-MASK vp<%9>
 ; CHECK-NEXT:     Successor(s): pred.store.if, pred.store.continue
-; CHECK-EMPTY: 
+; CHECK-EMPTY:
 ; CHECK-NEXT:     pred.store.if:
 ; CHECK-NEXT:       vp<%10> = SCALAR-STEPS vp<%7>, ir<1>, vp<%0>
 ; CHECK-NEXT:       REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%10>
@@ -46,12 +46,12 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NEXT:       REPLICATE ir<%add> = add ir<%1>, ir<%0>
 ; CHECK-NEXT:       REPLICATE store ir<%add>, ir<%arrayidx6>
 ; CHECK-NEXT:     Successor(s): pred.store.continue
-; CHECK-EMPTY: 
+; CHECK-EMPTY:
 ; CHECK-NEXT:     pred.store.continue:
 ; CHECK-NEXT:     No successors
 ; CHECK-NEXT:   }
 ; CHECK-NEXT:   Successor(s): for.body.2
-; CHECK-EMPTY: 
+; CHECK-EMPTY:
 ; CHECK-NEXT:   for.body.2:
 ; CHECK-NEXT:     EMIT vp<%popcount> = popcount vp<%6>
 ; CHECK-NEXT:     EMIT vp<%index.next> = add vp<%7>, vp<%popcount>
@@ -62,10 +62,10 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): middle.block
-; CHECK-EMPTY: 
+; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>
-; CHECK-EMPTY: 
+; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:

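Stripped of FileCheck annotations, the runtime check emitted ahead of the vector
preheader in @alias_mask_read_after_write reduces to roughly the following IR
(a sketch reconstructed from the CHECK lines above; value names are
illustrative):

  vector.memcheck:
    ; Per-lane safety mask for the c/b pair; the trailing i64 is the access
    ; size (4 for the i32 accesses here).
    %alias.lane.mask = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr %c, ptr %b, i64 4)
    ; Count the safe lanes by widening the i1 mask to i8 and summing it.
    %mask.ext = zext <vscale x 4 x i1> %alias.lane.mask to <vscale x 4 x i8>
    %lane.count = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %mask.ext)
    %count.eq.zero = icmp eq i8 %lane.count, 0
    br i1 %count.eq.zero, label %vector.ph, label %scalar.ph

Inside the vector body the mask is ANDed with the active lane mask, and the
lane count (zero-extended to i64) becomes the induction step.
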
>From 8f077f22de1fe1ba1771ad45cdc3b91a9add7e85 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 6 Aug 2025 17:04:52 +0100
Subject: [PATCH 03/13] Add target hook

---
 .../llvm/Analysis/TargetTransformInfo.h       |  5 ++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  2 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  4 ++
 .../AArch64/AArch64TargetTransformInfo.cpp    |  5 ++
 .../AArch64/AArch64TargetTransformInfo.h      |  2 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 60 ++-----------------
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |  3 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  5 +-
 .../AArch64/induction-costs-sve.ll            | 26 ++------
 .../vplan-printing-predicated.ll              | 52 +++++++---------
 10 files changed, 53 insertions(+), 111 deletions(-)
 rename llvm/test/Transforms/LoopVectorize/{ => AArch64}/vplan-printing-predicated.ll (68%)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 77d74ce75c47b..454e8a182252e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1354,6 +1354,11 @@ class TargetTransformInfo {
       PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
       TTI::TargetCostKind CostKind) const;
 
+  /// \return true if a mask should be formed that disables lanes that could
+  /// alias between two pointers. The mask is created by the
+  /// loop_dependence_{war,raw}_mask intrinsics.
+  LLVM_ABI bool useSafeEltsMask(ElementCount VF) const;
+
   /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
   /// and the number of execution units in the CPU.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index e879712121b59..d067d1bf95d85 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -660,6 +660,8 @@ class TargetTransformInfoImplBase {
     return InstructionCost::getInvalid();
   }
 
+  virtual bool useSafeEltsMask(ElementCount VF) const { return false; }
+
   virtual unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
 
   virtual InstructionCost getArithmeticInstrCost(
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4f04209cf4cfc..62ecca66cb09e 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -878,6 +878,10 @@ InstructionCost TargetTransformInfo::getPartialReductionCost(
                                           BinOp, CostKind);
 }
 
+bool TargetTransformInfo::useSafeEltsMask(ElementCount VF) const {
+  return TTIImpl->useSafeEltsMask(VF);
+}
+
 unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
   return TTIImpl->getMaxInterleaveFactor(VF);
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0b6f6ef447c77..b97d1da957b7b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5581,6 +5581,11 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   return Cost;
 }
 
+bool AArch64TTIImpl::useSafeEltsMask(ElementCount VF) const {
+  // The whilewr/rw instructions require SVE2
+  return ST->hasSVE2();
+}
+
 InstructionCost
 AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                                VectorType *SrcTy, ArrayRef<int> Mask,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 647b242d74fb3..707605b1b6c52 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -387,6 +387,8 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
       TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
       TTI::TargetCostKind CostKind) const override;
 
+  bool useSafeEltsMask(ElementCount VF) const override;
+
   bool enableOrderedReductions() const override { return true; }
 
   InstructionCost getInterleavedMemoryOpCost(
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 806ce5cba07f9..df7c549d64f7b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2081,8 +2081,10 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
 }
 
-static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style) {
-  return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask;
+static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style,
+                            ElementCount VF, const TargetTransformInfo &TTI) {
+  return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask &&
+         TTI.useSafeEltsMask(VF);
 }
 
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
@@ -2294,56 +2296,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
   return TTI.enableMaskedInterleavedAccessVectorization();
 }
 
-Value *
-InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
-  if (VectorTripCount)
-    return VectorTripCount;
-
-  Value *TC = getTripCount();
-  IRBuilder<> Builder(InsertBlock->getTerminator());
-
-  Type *Ty = TC->getType();
-  // This is where we can make the step a runtime constant.
-  Value *Step = createStepForVF(Builder, Ty, VF, UF);
-
-  // If the tail is to be folded by masking, round the number of iterations N
-  // up to a multiple of Step instead of rounding down. This is done by first
-  // adding Step-1 and then rounding down. Note that it's ok if this addition
-  // overflows: the vector induction variable will eventually wrap to zero given
-  // that it starts at zero and its Step is a power of two; the loop will then
-  // exit, with the last early-exit vector comparison also producing all-true.
-  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
-  // is accounted for in emitIterationCountCheck that adds an overflow check.
-  if (Cost->foldTailByMasking()) {
-    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
-           "VF*UF must be a power of 2 when folding tail by masking");
-    TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
-                           "n.rnd.up");
-  }
-
-  // Now we need to generate the expression for the part of the loop that the
-  // vectorized body will execute. This is equal to N - (N % Step) if scalar
-  // iterations are not required for correctness, or N - Step, otherwise. Step
-  // is equal to the vectorization factor (number of SIMD elements) times the
-  // unroll factor (number of SIMD instructions).
-  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
-
-  // There are cases where we *must* run at least one iteration in the remainder
-  // loop.  See the cost model for when this can happen.  If the step evenly
-  // divides the trip count, we set the remainder to be equal to the step. If
-  // the step does not evenly divide the trip count, no adjustment is necessary
-  // since there will already be scalar iterations. Note that the minimum
-  // iterations check ensures that N >= Step.
-  if (Cost->requiresScalarEpilogue(VF.isVector())) {
-    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
-    R = Builder.CreateSelect(IsZero, Step, R);
-  }
-
-  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
-
-  return VectorTripCount;
-}
-
 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
   // Note: The block with the minimum trip-count check is already connected
   // during earlier VPlan construction.
@@ -6781,7 +6733,7 @@ void LoopVectorizationPlanner::plan(
   ArrayRef<PointerDiffInfo> DiffChecks;
   auto TFStyle = CM.getTailFoldingStyle();
   if (RTChecks.has_value() &&
-      useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle)))
+      useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), UserVF, TTI))
     DiffChecks = *RTChecks;
 
   // Invalidate interleave groups if all blocks of loop will be predicated.
@@ -10202,7 +10154,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     if (VF.Width.isVector() || SelectedIC > 1) {
       TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
       bool UseSafeEltsMask =
-          useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle));
+          useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), VF.Width, *TTI);
       if (UseSafeEltsMask)
         LoopsAliasMasked++;
       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 0861a34ac3e42..1ad2265a6268f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -310,7 +310,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           })
           .Case<VPExpressionRecipe>([this](const auto *R) {
             return inferScalarType(R->getOperandOfResultType());
-          }).Case<VPAliasLaneMaskRecipe>([this](const VPAliasLaneMaskRecipe *R) {
+          })
+          .Case<VPAliasLaneMaskRecipe>([this](const VPAliasLaneMaskRecipe *R) {
             return Type::getInt1Ty(Ctx);
           });
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2801297fc0bbb..49b1e15c4aeff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3779,9 +3779,8 @@ void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
   Value *SinkValue = State.get(getSinkValue(), true);
   Value *SourceValue = State.get(getSourceValue(), true);
 
-  unsigned IntrinsicID = WriteAfterRead
-                             ? Intrinsic::loop_dependence_war_mask
-                             : Intrinsic::loop_dependence_raw_mask;
+  unsigned IntrinsicID = WriteAfterRead ? Intrinsic::loop_dependence_war_mask
+                                        : Intrinsic::loop_dependence_raw_mask;
   Value *SourceAsPtr = Builder.CreateCast(Instruction::IntToPtr, SourceValue,
                                           Builder.getPtrTy());
   Value *SinkAsPtr =
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 77eeac18e9b1d..401914a08ab68 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -90,8 +90,6 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-LABEL: define void @iv_casts(
 ; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; PRED-NEXT:  [[ENTRY:.*]]:
-; PRED-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
-; PRED-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
 ; PRED-NEXT:    [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; PRED-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
 ; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
@@ -101,7 +99,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
 ; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; PRED-NEXT:    br label %[[VECTOR_PH:.*]]
+; PRED-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
@@ -111,17 +109,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[SRC2]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[DST1]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PRED-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT6]], i32 0
-; PRED-NEXT:    [[TMP33:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT1]], i32 0
-; PRED-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-; PRED-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP25]] to ptr
-; PRED-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP34]], ptr [[TMP35]], i64 1)
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -132,8 +121,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; PRED:       [[VECTOR_BODY]]:
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], [[ALIAS_LANE_MASK]]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
 ; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; PRED-NEXT:    [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
@@ -143,12 +131,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
 ; PRED-NEXT:    [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; PRED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[TMP30:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
-; PRED-NEXT:    [[TMP31:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP30]])
-; PRED-NEXT:    [[TMP32:%.*]] = zext i8 [[TMP31]] to i64
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP32]]
+; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
 ; PRED-NEXT:    [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
 ; PRED-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
@@ -156,7 +140,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
 ; PRED:       [[SCALAR_PH]]:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; PRED-NEXT:    br label %[[LOOP:.*]]
 ; PRED:       [[LOOP]]:
 ; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-predicated.ll
similarity index 68%
rename from llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
rename to llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-predicated.ll
index e4c142c73b8ec..8887ef25437a4 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-predicated.ll
@@ -1,7 +1,8 @@
 ; REQUIRES: asserts
-; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data-and-control -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -mattr=+sve2 -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data-and-control -disable-output %s 2>&1 | FileCheck %s
 
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
 
 ; Tests for printing predicated VPlans.
 
@@ -20,6 +21,7 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
 ; CHECK-NEXT:   EMIT vp<%6> = ALIAS-LANE-MASK vp<%5>, vp<%4> (write-after-read)
+; CHECK-NEXT:   EMIT vp<%popcount> = popcount vp<%6>
 ; CHECK-NEXT:   EMIT vp<%index.part.next> = VF * Part + ir<0>
 ; CHECK-NEXT:   EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, vp<%3>
 ; CHECK-NEXT: Successor(s): vector loop
@@ -28,37 +30,23 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-NEXT:   vector.body:
 ; CHECK-NEXT:     EMIT vp<%7> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK-NEXT:     ACTIVE-LANE-MASK-PHI vp<%8> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next>
-; CHECK-NEXT:     EMIT vp<%9> = and vp<%8>, vp<%6>
-; CHECK-NEXT:   Successor(s): pred.store
-; CHECK-EMPTY:
-; CHECK-NEXT:   <xVFxUF> pred.store: {
-; CHECK-NEXT:     pred.store.entry:
-; CHECK-NEXT:       BRANCH-ON-MASK vp<%9>
-; CHECK-NEXT:     Successor(s): pred.store.if, pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT:     pred.store.if:
-; CHECK-NEXT:       vp<%10> = SCALAR-STEPS vp<%7>, ir<1>, vp<%0>
-; CHECK-NEXT:       REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%10>
-; CHECK-NEXT:       REPLICATE ir<%0> = load ir<%arrayidx>
-; CHECK-NEXT:       REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%10>
-; CHECK-NEXT:       REPLICATE ir<%1> = load ir<%arrayidx2>
-; CHECK-NEXT:       REPLICATE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<%10>
-; CHECK-NEXT:       REPLICATE ir<%add> = add ir<%1>, ir<%0>
-; CHECK-NEXT:       REPLICATE store ir<%add>, ir<%arrayidx6>
-; CHECK-NEXT:     Successor(s): pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT:     pred.store.continue:
-; CHECK-NEXT:     No successors
-; CHECK-NEXT:   }
-; CHECK-NEXT:   Successor(s): for.body.2
-; CHECK-EMPTY:
-; CHECK-NEXT:   for.body.2:
-; CHECK-NEXT:     EMIT vp<%popcount> = popcount vp<%6>
+; CHECK-NEXT:     vp<%9> = SCALAR-STEPS vp<%7>, ir<1>, vp<%0>
+; CHECK-NEXT:     EMIT vp<%10> = and vp<%8>, vp<%6>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%9>
+; CHECK-NEXT:     vp<%11> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:     WIDEN ir<%0> = load vp<%11>, vp<%10>
+; CHECK-NEXT:     CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%9>
+; CHECK-NEXT:     vp<%12> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT:     WIDEN ir<%1> = load vp<%12>, vp<%10>
+; CHECK-NEXT:     WIDEN ir<%add> = add ir<%1>, ir<%0>
+; CHECK-NEXT:     CLONE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<%9>
+; CHECK-NEXT:     vp<%13> = vector-pointer ir<%arrayidx6>
+; CHECK-NEXT:     WIDEN store vp<%13>, ir<%add>, vp<%10>
 ; CHECK-NEXT:     EMIT vp<%index.next> = add vp<%7>, vp<%popcount>
-; CHECK-NEXT:     EMIT vp<%11> = VF * Part + vp<%index.next>
-; CHECK-NEXT:     EMIT vp<%active.lane.mask.next> = active lane mask vp<%11>, vp<%3>
-; CHECK-NEXT:     EMIT vp<%12> = not vp<%active.lane.mask.next>
-; CHECK-NEXT:     EMIT branch-on-cond vp<%12>
+; CHECK-NEXT:     EMIT vp<%14> = VF * Part + vp<%index.next>
+; CHECK-NEXT:     EMIT vp<%active.lane.mask.next> = active lane mask vp<%14>, vp<%3>
+; CHECK-NEXT:     EMIT vp<%15> = not vp<%active.lane.mask.next>
+; CHECK-NEXT:     EMIT branch-on-cond vp<%15>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): middle.block

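For context, the two intrinsics that the new useSafeEltsMask() hook gates are
used in the tests above with these overloaded signatures (a sketch based on the
calls in the CHECK lines; only the return type is mangled, and the i64 operand
carries the access size in bytes):

  declare <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr, ptr, i64)
  declare <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr, ptr, i64)

The AArch64 implementation above enables the hook when SVE2 is available, where
these masks correspond to the whilewr/whilerw instructions.
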
>From 91390e6b18621b05834d355a0a4878a349a23229 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 13 Aug 2025 16:23:51 +0100
Subject: [PATCH 04/13] Add computeCost

---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  3 ++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  9 +++++
 .../AArch64/induction-costs-sve.ll            | 37 +++++--------------
 3 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6835e2e5e3f98..1d874f85e0ac1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3247,6 +3247,9 @@ class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
 
   bool isWriteAfterRead() const { return WriteAfterRead; }
 
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 private:
   unsigned ElementSize;
   bool WriteAfterRead;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 49b1e15c4aeff..a6aec2a63efeb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3792,6 +3792,15 @@ void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
   State.set(this, AliasMask, /*IsScalar=*/false);
 }
 
+InstructionCost
+VPAliasLaneMaskRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const {
+  Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
+  Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+  IntrinsicCostAttributes Attrs(isWriteAfterRead()
+                                    ? Intrinsic::loop_dependence_war_mask
+                                    : Intrinsic::loop_dependence_raw_mask,
+                                RetTy, {ArgTy, ArgTy});
+  return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 401914a08ab68..6e0a951ae3a43 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -28,8 +28,6 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
 ; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
 ; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; DEFAULT-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[TMP13:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
@@ -60,7 +58,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[TMP38]], i64 [[TMP42]]
 ; DEFAULT-NEXT:    store <vscale x 8 x i8> [[TMP36]], ptr [[TMP38]], align 1
 ; DEFAULT-NEXT:    store <vscale x 8 x i8> [[TMP37]], ptr [[TMP43]], align 1
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
 ; DEFAULT-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; DEFAULT-NEXT:    br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
@@ -101,12 +99,6 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; PRED-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
-; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
-; PRED-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP5]], 1
-; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
-; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
-; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
@@ -134,9 +126,9 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; PRED-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
-; PRED-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; PRED-NEXT:    [[TMP27:%.*]] = xor i1 [[TMP25]], true
+; PRED-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
 ; PRED:       [[SCALAR_PH]]:
@@ -270,9 +262,6 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
 ; PRED-NEXT:    br i1 [[TMP12]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
-; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
-; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
-; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], 2
 ; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2
 ; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
@@ -304,9 +293,9 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED:       [[PRED_STORE_CONTINUE2]]:
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP24:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; PRED-NEXT:    [[TMP25:%.*]] = xor i1 [[TMP24]], true
 ; PRED-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
-; PRED-NEXT:    [[TMP25:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0
 ; PRED-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
@@ -441,9 +430,6 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
 ; PRED-NEXT:    br i1 [[TMP13]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
-; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP14:%.*]] = sub i64 [[TMP0]], 4
 ; PRED-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
 ; PRED-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
@@ -496,9 +482,9 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED:       [[PRED_STORE_CONTINUE7]]:
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
-; PRED-NEXT:    [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; PRED-NEXT:    [[TMP36:%.*]] = xor i1 [[TMP35]], true
 ; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; PRED-NEXT:    [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0
 ; PRED-NEXT:    br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
@@ -635,9 +621,6 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
 ; PRED-NEXT:    br i1 [[TMP12]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
-; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], 4
 ; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4
 ; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
@@ -690,9 +673,9 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED:       [[PRED_STORE_CONTINUE6]]:
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; PRED-NEXT:    [[TMP35:%.*]] = xor i1 [[TMP34]], true
 ; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; PRED-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0
 ; PRED-NEXT:    br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]

>From 101218b463b08ca5b0c3ff172b6aa6f453c0bb28 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 15:28:52 +0100
Subject: [PATCH 05/13] Remove VF from useSafeEltsMask

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h       | 2 +-
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h   | 2 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp              | 4 ++--
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +-
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h   | 2 +-
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp        | 8 ++++----
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 454e8a182252e..f2b0c6c332169 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1357,7 +1357,7 @@ class TargetTransformInfo {
   /// \return true if a mask should be formed that disables lanes that could
   /// alias between two pointers. The mask is created by the
   /// loop_dependence_{war,raw}_mask intrinsics.
-  LLVM_ABI bool useSafeEltsMask(ElementCount VF) const;
+  LLVM_ABI bool useSafeEltsMask() const;
 
   /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index d067d1bf95d85..437af391340a6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -660,7 +660,7 @@ class TargetTransformInfoImplBase {
     return InstructionCost::getInvalid();
   }
 
-  virtual bool useSafeEltsMask(ElementCount VF) const { return false; }
+  virtual bool useSafeEltsMask() const { return false; }
 
   virtual unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 62ecca66cb09e..9efac5d3a027b 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -878,8 +878,8 @@ InstructionCost TargetTransformInfo::getPartialReductionCost(
                                           BinOp, CostKind);
 }
 
-bool TargetTransformInfo::useSafeEltsMask(ElementCount VF) const {
-  return TTIImpl->useSafeEltsMask(VF);
+bool TargetTransformInfo::useSafeEltsMask() const {
+  return TTIImpl->useSafeEltsMask();
 }
 
 unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b97d1da957b7b..67e04b6d9e97d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5581,7 +5581,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   return Cost;
 }
 
-bool AArch64TTIImpl::useSafeEltsMask(ElementCount VF) const {
+bool AArch64TTIImpl::useSafeEltsMask() const {
   // The whilewr/rw instructions require SVE2
   return ST->hasSVE2();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 707605b1b6c52..bf1a36805ff58 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -387,7 +387,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
       TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
       TTI::TargetCostKind CostKind) const override;
 
-  bool useSafeEltsMask(ElementCount VF) const override;
+  bool useSafeEltsMask() const override;
 
   bool enableOrderedReductions() const override { return true; }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index df7c549d64f7b..3e39581b681c2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2082,9 +2082,9 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
 }
 
 static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style,
-                            ElementCount VF, const TargetTransformInfo &TTI) {
+                            const TargetTransformInfo &TTI) {
   return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask &&
-         TTI.useSafeEltsMask(VF);
+         TTI.useSafeEltsMask();
 }
 
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
@@ -6733,7 +6733,7 @@ void LoopVectorizationPlanner::plan(
   ArrayRef<PointerDiffInfo> DiffChecks;
   auto TFStyle = CM.getTailFoldingStyle();
   if (RTChecks.has_value() &&
-      useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), UserVF, TTI))
+      useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), TTI))
     DiffChecks = *RTChecks;
 
   // Invalidate interleave groups if all blocks of loop will be predicated.
@@ -10154,7 +10154,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     if (VF.Width.isVector() || SelectedIC > 1) {
       TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
       bool UseSafeEltsMask =
-          useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), VF.Width, *TTI);
+          useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), *TTI);
       if (UseSafeEltsMask)
         LoopsAliasMasked++;
       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,

>From 0ae1e06d434c0e956179d3220ebafae288f39027 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 15:29:22 +0100
Subject: [PATCH 06/13] Calculate expanded cost and allow the intrinsic for SME

---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      | 47 +++++++++++++++++++
 .../AArch64/AArch64TargetTransformInfo.cpp    |  4 +-
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 1216433fec594..f74da0f271bd7 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2141,6 +2141,53 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       // Otherwise, fallback to default scalarization cost.
       break;
     }
+    case Intrinsic::loop_dependence_raw_mask:
+    case Intrinsic::loop_dependence_war_mask: {
+      InstructionCost Cost = 0;
+      Type *PtrTy = ICA.getArgTypes()[0];
+      bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;
+
+      Cost +=
+          thisT()->getArithmeticInstrCost(Instruction::Sub, PtrTy, CostKind);
+      if (IsReadAfterWrite) {
+        IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, PtrTy, {PtrTy}, {});
+        Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind);
+      }
+
+      Cost +=
+          thisT()->getArithmeticInstrCost(Instruction::SDiv, PtrTy, CostKind);
+      Type *CmpTy =
+          getTLI()
+              ->getSetCCResultType(
+                  thisT()->getDataLayout(), RetTy->getContext(),
+                  getTLI()->getValueType(thisT()->getDataLayout(), PtrTy))
+              .getTypeForEVT(RetTy->getContext());
+      Cost += thisT()->getCmpSelInstrCost(
+          BinaryOperator::ICmp, CmpTy, PtrTy,
+          IsReadAfterWrite ? CmpInst::ICMP_EQ : CmpInst::ICMP_SLE, CostKind);
+
+      // Cost of building the lane mask: splat, step vector and compare.
+      VectorType *RetTyVec = cast<VectorType>(RetTy);
+      VectorType *SplatTy = cast<VectorType>(RetTyVec->getWithNewType(PtrTy));
+      Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SplatTy, SplatTy, {},
+                                      CostKind, 0, nullptr);
+      IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, SplatTy, {},
+                                           FMF);
+      Cost += thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
+      Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SplatTy,
+                                          SplatTy, CmpInst::ICMP_ULT, CostKind);
+
+      Cost +=
+          thisT()->getCastInstrCost(Instruction::CastOps::ZExt, RetTy, SplatTy,
+                                    TTI::CastContextHint::None, CostKind);
+      Cost += thisT()->getCastInstrCost(Instruction::CastOps::ZExt,
+                                        RetTyVec->getElementType(), CmpTy,
+                                        TTI::CastContextHint::None, CostKind);
+      Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, RetTyVec, RetTyVec, {},
+                                      CostKind, 0, nullptr);
+      Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
+      return Cost;
+    }
     }
 
     // Assume that we need to scalarize this intrinsic.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 67e04b6d9e97d..654efcac32bb0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -991,9 +991,9 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   }
   case Intrinsic::loop_dependence_raw_mask:
   case Intrinsic::loop_dependence_war_mask:
-    if (ST->hasSVE2())
+    if (ST->hasSVE2() || ST->hasSME())
       return 1;
-    return InstructionCost::getInvalid(CostKind);
+    break;
   default:
     break;
   }

>From b99c777c12aeafb0f3329de2c1089b10f7c62f85 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 15:33:04 +0100
Subject: [PATCH 07/13] Improve WriteAfterRead assignment

---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 20ac57fe7d889..fa3a496a8c233 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -482,7 +482,7 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
     }
   }
 
-  bool WriteAfterRead = isa<LoadInst>(SrcInsts[0]);
+  bool WriteAfterRead = !Src->IsWritePtr && Sink->IsWritePtr;
 
   LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
                     << "SrcStart: " << *SrcStartInt << '\n'

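To illustrate what the corrected WriteAfterRead flag selects (a sketch reusing
the calls already present in the tests above; pointer names are illustrative):
when the source access is a read and the sink a write, the write-after-read
variant is emitted, otherwise the read-after-write variant is used.

  ; write-after-read: %src is only loaded and %dst only stored (i8 accesses)
  %war = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %src, ptr %dst, i64 1)
  ; read-after-write: the store to %c precedes the load from %b (i32 accesses)
  %raw = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr %c, ptr %b, i64 4)
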
>From b48c391f9a4e8c77e384848bbb05031f1664f79c Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 15:36:23 +0100
Subject: [PATCH 08/13] whilewr/rw accept SME as well

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 654efcac32bb0..3c9ca84c566fb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -991,6 +991,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   }
   case Intrinsic::loop_dependence_raw_mask:
   case Intrinsic::loop_dependence_war_mask:
+    // The whilewr/rw instructions require SVE2 or SME
     if (ST->hasSVE2() || ST->hasSME())
       return 1;
     break;
@@ -5583,7 +5584,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
 
 bool AArch64TTIImpl::useSafeEltsMask() const {
   // The whilewr/rw instructions require SVE2
-  return ST->hasSVE2();
+  return ST->hasSVE2() || ST->hasSME();
 }
 
 InstructionCost

>From 4206436b8a4e2a4a7df7b97759e232a005cba4ca Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 16:26:15 +0100
Subject: [PATCH 09/13] Add addSafeEltsRuntimeChecks

---
 .../include/llvm/Transforms/Utils/LoopUtils.h |  13 +-
 llvm/lib/Transforms/Utils/LoopUtils.cpp       | 130 ++++++++++--------
 .../Transforms/Vectorize/LoopVectorize.cpp    |   7 +-
 3 files changed, 86 insertions(+), 64 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index e9515c32b0603..a1172ba2557dd 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -569,11 +569,14 @@ addRuntimeChecks(Instruction *Loc, Loop *TheLoop,
                  const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
                  SCEVExpander &Expander, bool HoistRuntimeChecks = false);
 
-LLVM_ABI Value *
-addDiffRuntimeChecks(Instruction *Loc, ArrayRef<PointerDiffInfo> Checks,
-                     SCEVExpander &Expander,
-                     function_ref<Value *(IRBuilderBase &, unsigned)> GetVF,
-                     unsigned IC, ElementCount VF, bool UseSafeEltsMask);
+LLVM_ABI Value *addSafeEltsRuntimeChecks(Instruction *Loc,
+                                         ArrayRef<PointerDiffInfo> Checks,
+                                         SCEVExpander &Expander,
+                                         ElementCount VF);
+
+LLVM_ABI Value *addDiffRuntimeChecks(
+    Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
+    function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC);
 
 /// Struct to hold information about a partially invariant condition.
 struct IVConditionInfo {
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 248f6fa7d7f5e..db150d09d26f6 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2020,10 +2020,61 @@ Value *llvm::addRuntimeChecks(
   return MemoryRuntimeCheck;
 }
 
+Value *llvm::addSafeEltsRuntimeChecks(Instruction *Loc,
+                                      ArrayRef<PointerDiffInfo> Checks,
+                                      SCEVExpander &Expander, ElementCount VF) {
+  IRBuilder ChkBuilder(Loc->getContext(),
+                       InstSimplifyFolder(Loc->getDataLayout()));
+  ChkBuilder.SetInsertPoint(Loc);
+  Value *MemoryRuntimeCheck = nullptr;
+
+  // Map to keep track of created compares, The key is the pair of operands for
+  // the compare, to allow detecting and re-using redundant compares.
+  DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
+  Value *AliasLaneMask = nullptr;
+  for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze,
+                    WriteAfterRead] : Checks) {
+    Type *Ty = SinkStart->getType();
+    Value *Sink = Expander.expandCodeFor(SinkStart, Ty, Loc);
+    Value *Src = Expander.expandCodeFor(SrcStart, Ty, Loc);
+    if (SeenCompares.lookup({Sink, Src}))
+      continue;
+
+    unsigned IntOpc = WriteAfterRead ? Intrinsic::loop_dependence_war_mask
+                                     : Intrinsic::loop_dependence_raw_mask;
+    Value *SourceAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Src,
+                                               ChkBuilder.getPtrTy());
+    Value *SinkAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Sink,
+                                             ChkBuilder.getPtrTy());
+    Value *M = ChkBuilder.CreateIntrinsic(
+        IntOpc, {VectorType::get(ChkBuilder.getInt1Ty(), VF)},
+        {SourceAsPtr, SinkAsPtr, ChkBuilder.getInt64(AccessSize)}, nullptr,
+        "alias.lane.mask");
+    SeenCompares.insert({{Sink, Src}, M});
+    if (AliasLaneMask)
+      M = ChkBuilder.CreateAnd(AliasLaneMask, M);
+    else
+      AliasLaneMask = M;
+  }
+  assert(AliasLaneMask && "Expected an alias lane mask to have been created.");
+  auto *VecVT = VectorType::get(ChkBuilder.getInt1Ty(), VF);
+  // Zero-extend to i8 so the reduction counts set lanes instead of adding i1s
+  Value *PopCount = ChkBuilder.CreateCast(
+      Instruction::ZExt, AliasLaneMask,
+      VectorType::get(ChkBuilder.getInt8Ty(), VecVT->getElementCount()));
+
+  PopCount =
+      ChkBuilder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, PopCount);
+  PopCount = ChkBuilder.CreateCast(Instruction::ZExt, PopCount,
+                                   ChkBuilder.getInt64Ty());
+  MemoryRuntimeCheck = ChkBuilder.CreateICmpUGT(
+      PopCount, ConstantInt::get(ChkBuilder.getInt64Ty(), 0));
+  return MemoryRuntimeCheck;
+}
+
 Value *llvm::addDiffRuntimeChecks(
     Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
-    function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC,
-    ElementCount VF, bool UseSafeEltsMask) {
+    function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC) {
 
   LLVMContext &Ctx = Loc->getContext();
   IRBuilder ChkBuilder(Ctx, InstSimplifyFolder(Loc->getDataLayout()));
@@ -2035,68 +2086,33 @@ Value *llvm::addDiffRuntimeChecks(
   // Map to keep track of created compares, The key is the pair of operands for
   // the compare, to allow detecting and re-using redundant compares.
   DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
-  Value *AliasLaneMask = nullptr;
   for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze,
                     WriteAfterRead] : Checks) {
     Type *Ty = SinkStart->getType();
-    if (!VF.isScalar() && UseSafeEltsMask) {
-      Value *Sink = Expander.expandCodeFor(SinkStart, Ty, Loc);
-      Value *Src = Expander.expandCodeFor(SrcStart, Ty, Loc);
-      unsigned IntOpc = WriteAfterRead ? Intrinsic::loop_dependence_war_mask
-                                       : Intrinsic::loop_dependence_raw_mask;
-      Value *SourceAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Src,
-                                                 ChkBuilder.getPtrTy());
-      Value *SinkAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Sink,
-                                               ChkBuilder.getPtrTy());
-      Value *M = ChkBuilder.CreateIntrinsic(
-          IntOpc, {VectorType::get(ChkBuilder.getInt1Ty(), VF)},
-          {SourceAsPtr, SinkAsPtr, ChkBuilder.getInt64(AccessSize)}, nullptr,
-          "alias.lane.mask");
-      if (AliasLaneMask)
-        M = ChkBuilder.CreateAnd(AliasLaneMask, M);
-      else
-        AliasLaneMask = M;
-    } else {
-      // Compute VF * IC * AccessSize.
-      auto *VFTimesICTimesSize =
-          ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
-                               ConstantInt::get(Ty, IC * AccessSize));
-      Value *Diff =
-          Expander.expandCodeFor(SE.getMinusSCEV(SinkStart, SrcStart), Ty, Loc);
-
-      // Check if the same compare has already been created earlier. In that
-      // case, there is no need to check it again.
-      Value *IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
-      if (IsConflict)
-        continue;
+    // Compute VF * IC * AccessSize.
+    auto *VFTimesICTimesSize =
+        ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
+                             ConstantInt::get(Ty, IC * AccessSize));
+    Value *Diff =
+        Expander.expandCodeFor(SE.getMinusSCEV(SinkStart, SrcStart), Ty, Loc);
+
+    // Check if the same compare has already been created earlier. In that
+    // case, there is no need to check it again.
+    Value *IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
+    if (IsConflict)
+      continue;
 
+    IsConflict =
+        ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
+    SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
+    if (NeedsFreeze)
+      IsConflict =
+          ChkBuilder.CreateFreeze(IsConflict, IsConflict->getName() + ".fr");
+    if (MemoryRuntimeCheck) {
       IsConflict =
-          ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
-      SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
-      if (NeedsFreeze)
-        IsConflict =
-            ChkBuilder.CreateFreeze(IsConflict, IsConflict->getName() + ".fr");
-      if (MemoryRuntimeCheck) {
-        IsConflict =
-            ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
+          ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
       }
       MemoryRuntimeCheck = IsConflict;
-    }
-  }
-
-  if (AliasLaneMask) {
-    auto *VecVT = VectorType::get(ChkBuilder.getInt1Ty(), VF);
-    // Extend to an i8 since i1 is too small to add with
-    Value *PopCount = ChkBuilder.CreateCast(
-        Instruction::ZExt, AliasLaneMask,
-        VectorType::get(ChkBuilder.getInt8Ty(), VecVT->getElementCount()));
-
-    PopCount =
-        ChkBuilder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, PopCount);
-    PopCount = ChkBuilder.CreateCast(Instruction::ZExt, PopCount,
-                                     ChkBuilder.getInt64Ty());
-    MemoryRuntimeCheck = ChkBuilder.CreateICmpUGT(
-        PopCount, ConstantInt::get(ChkBuilder.getInt64Ty(), 0));
   }
 
   return MemoryRuntimeCheck;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3e39581b681c2..397db3a9c620c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1872,7 +1872,10 @@ class GeneratedRTChecks {
                                  "vector.memcheck");
 
       auto DiffChecks = RtPtrChecking.getDiffChecks();
-      if (DiffChecks) {
+      if (UseSafeEltsMask) {
+        MemRuntimeCheckCond = addSafeEltsRuntimeChecks(
+            MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, VF);
+      } else if (DiffChecks) {
         Value *RuntimeVF = nullptr;
         MemRuntimeCheckCond = addDiffRuntimeChecks(
             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
@@ -1881,7 +1884,7 @@ class GeneratedRTChecks {
                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
               return RuntimeVF;
             },
-            IC, VF, UseSafeEltsMask);
+            IC);
       } else {
         MemRuntimeCheckCond = addRuntimeChecks(
             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),

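For illustration, this is a minimal sketch of the check the new addSafeEltsRuntimeChecks helper above emits for a single write-after-read pointer pair at VF = vscale x 16. The pointer names and the 1-byte access size are assumed for the example; the exact output is what the alias_mask.ll checks further down verify. The lane mask is widened to i8, summed with vector.reduce.add, and the resulting count is compared against zero to choose between the scalar and vector paths:

  vector.memcheck:                                  ; illustrative block, names assumed
    %alias.lane.mask = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %b, ptr %c, i64 1)
    %mask.ext = zext <vscale x 16 x i1> %alias.lane.mask to <vscale x 16 x i8>
    %sum = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %mask.ext)
    %sum.zext = zext i8 %sum to i64
    %check = icmp ugt i64 %sum.zext, 0
    br i1 %check, label %scalar.ph, label %vector.ph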
>From a772c286ac5cbebdd454e4a7da65051763bf9cc2 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 17:09:47 +0100
Subject: [PATCH 10/13] Move RTChecks check closer to use

---
 .../Vectorize/LoopVectorizationPlanner.h      | 12 ++-----
 .../Transforms/Vectorize/LoopVectorize.cpp    | 34 +++++++++----------
 2 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 4911d2b69e8f8..7671b6acc7535 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -582,21 +582,13 @@ class LoopVectorizationPlanner {
   /// set the largest included VF to the maximum VF for which no plan could be
   /// built. Each VPlan is built starting from a copy of \p InitialPlan, which
   /// is a plain CFG VPlan wrapping the original scalar loop.
-  /// RTChecks is a list of pointer pairs that should be checked for aliasing,
-  /// combining the resulting predicate with an active lane mask if one is in
-  /// use.
   VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range,
-                                        LoopVersioning *LVer,
-                                        ArrayRef<PointerDiffInfo> RTChecks);
+                                        LoopVersioning *LVer);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
-  /// RTChecks is a list of pointer pairs that should be checked for aliasing,
-  /// combining the resulting predicate with an active lane mask if one is in
-  /// use.
-  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
-                                ArrayRef<PointerDiffInfo> RTChecks);
+  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
 
   // Adjust the recipes for reductions. For in-loop reductions the chain of
   // instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 397db3a9c620c..1fbac45e6dca7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2398,6 +2398,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                    static_cast<DominatorTree *>(nullptr), LI,
                                    nullptr, "vector.ph");
+
   BranchInst &BI =
       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
@@ -6733,12 +6734,6 @@ void LoopVectorizationPlanner::plan(
   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
     return;
 
-  ArrayRef<PointerDiffInfo> DiffChecks;
-  auto TFStyle = CM.getTailFoldingStyle();
-  if (RTChecks.has_value() &&
-      useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), TTI))
-    DiffChecks = *RTChecks;
-
   // Invalidate interleave groups if all blocks of loop will be predicated.
   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
       !useMaskedInterleavedAccesses(TTI)) {
@@ -6771,7 +6766,7 @@ void LoopVectorizationPlanner::plan(
       CM.collectInLoopReductions();
       if (CM.selectUserVectorizationFactor(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-        buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks);
+        buildVPlansWithVPRecipes(UserVF, UserVF);
         LLVM_DEBUG(printPlans(dbgs()));
         return;
       }
@@ -6795,10 +6790,8 @@ void LoopVectorizationPlanner::plan(
     CM.collectNonVectorizedAndSetWideningDecisions(VF);
   }
 
-  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
-                           DiffChecks);
-  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
-                           DiffChecks);
+  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
+  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
 
   LLVM_DEBUG(printPlans(dbgs()));
 }
@@ -8407,9 +8400,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
                                       ScaleFactor, Reduction);
 }
 
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
-    ElementCount MinVF, ElementCount MaxVF,
-    ArrayRef<PointerDiffInfo> DiffChecks) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
+                                                        ElementCount MaxVF) {
   if (ElementCount::isKnownGT(MinVF, MaxVF))
     return;
 
@@ -8435,8 +8427,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
     if (auto Plan = tryToBuildVPlanWithVPRecipes(
-            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer,
-            DiffChecks)) {
+            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
       bool HasScalarVF = Plan->hasScalarVFOnly();
       // Now optimize the initial VPlan.
       if (!HasScalarVF)
@@ -8661,8 +8652,7 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
 }
 
 VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
-    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer,
-    ArrayRef<PointerDiffInfo> DiffChecks) {
+    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
 
   using namespace llvm::VPlanPatternMatch;
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8950,6 +8940,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
     bool WithoutRuntimeCheck =
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+
+    ArrayRef<PointerDiffInfo> DiffChecks;
+    std::optional<ArrayRef<PointerDiffInfo>> RTChecks =
+        CM.Legal->getRuntimePointerChecking()->getDiffChecks();
+    if (RTChecks.has_value() &&
+        useSafeEltsMask(Style, CM.getRTCheckStyle(Style), TTI))
+      DiffChecks = *RTChecks;
+
     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
                                        WithoutRuntimeCheck, PSE, DiffChecks);
   }

>From 1a923d48ae09bd6f6d79f92eb225dc741ce05058 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 17:10:35 +0100
Subject: [PATCH 11/13] Don't run extra passes in tests

---
 .../LoopVectorize/AArch64/alias_mask.ll       | 76 ++++++++++++++-----
 1 file changed, 59 insertions(+), 17 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index acd6beb303954..a9726778cceac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -1,10 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --filter-out-after "^scalar.ph:" --version 4
-; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize,instcombine,early-cse -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
 
 define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK-LABEL: define dso_local void @alias_mask(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B4:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
 ; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
@@ -13,19 +17,35 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[B]], ptr [[C]], i64 1)
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[DOTNOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP24:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[B4]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[C3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT6]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK7:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP12]], ptr [[TMP9]], i64 1)
+; CHECK-NEXT:    [[TMP10:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK7]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP10]])
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i8 [[TMP11]] to i64
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP8]], 4
-; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP8]], 16
+; CHECK-NEXT:    [[TMP15:%.*]] = sub i64 [[N]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ugt i64 [[N]], [[TMP14]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP17]], i64 [[TMP15]], i64 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-NEXT:    [[TMP25:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK7]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
@@ -35,8 +55,9 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP21]], i32 1, <vscale x 16 x i1> [[TMP25]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT:    br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = xor i1 [[TMP23]], true
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ;
 entry:
@@ -64,6 +85,10 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
 ; CHECK-LABEL: define i32 @alias_mask_read_after_write(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C4:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C2:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B]] to i64
 ; CHECK-NEXT:    [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
@@ -72,20 +97,36 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
 ; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[C]], ptr [[B]], i64 4)
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[DOTNOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[C4]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[B3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT6]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK7:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[TMP8]], ptr [[TMP12]], i64 4)
+; CHECK-NEXT:    [[TMP10:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK7]] to <vscale x 4 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP10]])
+; CHECK-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP11]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP9]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = sub i64 [[N]], [[TMP18]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ugt i64 [[N]], [[TMP18]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP31:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-NEXT:    [[TMP31:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK7]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP17]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
@@ -97,8 +138,9 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
 ; CHECK-NEXT:    [[TMP25]] = select <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT:    br i1 [[TMP13]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = xor i1 [[TMP28]], true
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ;
 entry:

>From 023ae0549c4c913c031bd60ccd498e2f73e54733 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 17:18:21 +0100
Subject: [PATCH 12/13] Add tests with multiple aliasing pairs

---
 .../LoopVectorize/AArch64/alias_mask.ll       | 190 ++++++++++++++++++
 1 file changed, 190 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index a9726778cceac..c7344f85f8e02 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -166,3 +166,193 @@ exit:                        ; preds = %entry, %for.body
   %result = phi i32 [ 0, %entry ], [ %add2, %for.body ]
   ret i32 %result
 }
+
+define dso_local void @alias_mask_multiple(ptr %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define dso_local void @alias_mask_multiple(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B7:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[A6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[C5:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[A]], ptr [[C]], i64 1)
+; CHECK-NEXT:    [[ALIAS_LANE_MASK4:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[B]], ptr [[C]], i64 1)
+; CHECK-NEXT:    [[TMP0:%.*]] = and <vscale x 16 x i1> [[ALIAS_LANE_MASK]], [[ALIAS_LANE_MASK4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i64 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[B7]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[A6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT8]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[C5]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT10]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT11]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT9]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK12:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP9]], ptr [[TMP10]], i64 1)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT11]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK13:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP13]], ptr [[TMP14]], i64 1)
+; CHECK-NEXT:    [[TMP15:%.*]] = and <vscale x 16 x i1> [[ALIAS_LANE_MASK12]], [[ALIAS_LANE_MASK13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <vscale x 16 x i1> [[TMP15]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP17:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP16]])
+; CHECK-NEXT:    [[TMP18:%.*]] = zext i8 [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 16
+; CHECK-NEXT:    [[TMP21:%.*]] = sub i64 [[N]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ugt i64 [[N]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP24:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP15]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP25]], i32 1, <vscale x 16 x i1> [[TMP24]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP24]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP27:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP27]], ptr [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP24]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP23]])
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = xor i1 [[TMP29]], true
+; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+;
+entry:
+  %cmp11 = icmp sgt i64 %n, 0
+  br i1 %cmp11, label %for.body, label %exit
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %add = add i8 %load.b, %load.a
+  %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+  store i8 %add, ptr %gep.c, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  ret void
+}
+
+define i32 @alias_mask_multiple_read_after_write(ptr %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define i32 @alias_mask_multiple_read_after_write(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B7:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[A6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[C5:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr [[A]], ptr [[C]], i64 4)
+; CHECK-NEXT:    [[ALIAS_LANE_MASK4:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[C]], ptr [[B]], i64 4)
+; CHECK-NEXT:    [[TMP0:%.*]] = and <vscale x 4 x i1> [[ALIAS_LANE_MASK]], [[ALIAS_LANE_MASK4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i64 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[B7]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT8]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[C5]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT10]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT11]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT9]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK12:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr [[TMP9]], ptr [[TMP10]], i64 4)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK13:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[TMP13]], ptr [[TMP14]], i64 4)
+; CHECK-NEXT:    [[TMP15:%.*]] = and <vscale x 4 x i1> [[ALIAS_LANE_MASK12]], [[ALIAS_LANE_MASK13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <vscale x 4 x i1> [[TMP15]] to <vscale x 4 x i8>
+; CHECK-NEXT:    [[TMP17:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP16]])
+; CHECK-NEXT:    [[TMP18:%.*]] = zext i8 [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
+; CHECK-NEXT:    [[TMP21:%.*]] = sub i64 [[N]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ugt i64 [[N]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP24:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[TMP15]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP25]], i32 2, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP26]], i32 2, <vscale x 4 x i1> [[TMP24]])
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP27]], i32 2, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP28:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP29:%.*]] = add <vscale x 4 x i32> [[TMP28]], [[WIDE_MASKED_LOAD14]]
+; CHECK-NEXT:    [[TMP30]] = select <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> [[TMP29]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP23]])
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT:    [[TMP32:%.*]] = xor i1 [[TMP31]], true
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.block:
+;
+entry:
+  %cmp19 = icmp sgt i64 %n, 0
+  br i1 %cmp19, label %for.body, label %exit
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+  %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+  %load.a = load i32, ptr %gep.a, align 2
+  %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+  store i32 %load.a, ptr %gep.c, align 2
+  %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+  %load.b = load i32, ptr %gep.b, align 2
+  %add = add i32 %load.a, %accum
+  %add2 = add i32 %add, %load.b
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                        ; preds = %entry, %for.body
+  %result = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+  ret i32 %result
+}

>From e266411a39460d2a29a331284b724549a0fd8033 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 15 Aug 2025 10:52:20 +0100
Subject: [PATCH 13/13] Remove LV::useSafeEltsMask

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 19 ++++++++-----------
 .../AArch64/induction-costs-sve.ll            | 12 ++++++------
 .../runtime-checks-difference.ll              |  5 ++---
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1fbac45e6dca7..872ae62b7398d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1349,7 +1349,10 @@ class LoopVectorizationCostModel {
                                : ChosenTailFoldingStyle->second;
   }
 
-  RTCheckStyle getRTCheckStyle(TailFoldingStyle TFStyle) const {
+  RTCheckStyle getRTCheckStyle(TailFoldingStyle TFStyle, const TargetTransformInfo &TTI) const {
+    if (!TTI.useSafeEltsMask())
+      return RTCheckStyle::ScalarDifference;
+
     switch (TFStyle) {
     case TailFoldingStyle::Data:
     case TailFoldingStyle::DataAndControlFlow:
@@ -1360,8 +1363,8 @@ class LoopVectorizationCostModel {
     }
   }
 
-  RTCheckStyle getRTCheckStyle() const {
-    return getRTCheckStyle(getTailFoldingStyle());
+  RTCheckStyle getRTCheckStyle(const TargetTransformInfo &TTI) const {
+    return getRTCheckStyle(getTailFoldingStyle(), TTI);
   }
 
   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
@@ -2084,12 +2087,6 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
 }
 
-static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style,
-                            const TargetTransformInfo &TTI) {
-  return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask &&
-         TTI.useSafeEltsMask();
-}
-
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -8945,7 +8942,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     std::optional<ArrayRef<PointerDiffInfo>> RTChecks =
         CM.Legal->getRuntimePointerChecking()->getDiffChecks();
     if (RTChecks.has_value() &&
-        useSafeEltsMask(Style, CM.getRTCheckStyle(Style), TTI))
+        CM.getRTCheckStyle(Style, TTI) == RTCheckStyle::UseSafeEltsMask)
       DiffChecks = *RTChecks;
 
     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
@@ -10155,7 +10152,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     if (VF.Width.isVector() || SelectedIC > 1) {
       TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
       bool UseSafeEltsMask =
-          useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), *TTI);
+          CM.getRTCheckStyle(TFStyle, *TTI) == RTCheckStyle::UseSafeEltsMask;
       if (UseSafeEltsMask)
         LoopsAliasMasked++;
       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 6e0a951ae3a43..3cef1f6e03ff9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -88,19 +88,19 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-LABEL: define void @iv_casts(
 ; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; PRED-NEXT:  [[ENTRY:.*]]:
-; PRED-NEXT:    [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
-; PRED-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
 ; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; PRED-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; PRED:       [[VECTOR_MEMCHECK]]:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
-; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
+; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; PRED-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
-; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
+; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
@@ -124,7 +124,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
 ; PRED-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; PRED-NEXT:    [[TMP27:%.*]] = xor i1 [[TMP25]], true
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index e48979c4532a6..b640c1911cb0d 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -339,13 +339,12 @@ exit:
 ; of the outer loop as start value. It is sufficient to subtract the start
 ; values (%dst, %src) of the outer AddRecs.
 define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %m, i64 noundef %n) {
-;
 ; CHECK-LABEL: define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(
 ; CHECK-SAME: ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i64 noundef [[M:%.*]], i64 noundef [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; CHECK-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; CHECK-NEXT:    br label %[[OUTER_LOOP:.*]]
 ; CHECK:       [[OUTER_LOOP]]:
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
@@ -353,7 +352,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
-; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[SUB]], 16
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]]
 ;
 entry:


