[llvm-branch-commits] [llvm] [LV] Mask off possibly aliasing vector lanes (PR #100579)
Sam Tebbs via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Aug 15 09:17:27 PDT 2025
https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/100579
>From 19972e24a852cf46084ef2d6bb931950493b8095 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 18 Oct 2024 15:49:22 +0100
Subject: [PATCH 01/15] [LV] Mask off possibly aliasing vector lanes
When vectorising a loop that uses loads and stores, those pointers could
overlap if their difference is less than the vector factor. For example,
if address 20 is being stored to and address 23 is being loaded from, they
overlap when the vector factor is 4 or higher. Currently LoopVectorize
branches to a scalar loop in these cases with a runtime check. However, if
we construct a mask that disables the overlapping (aliasing) lanes, then
the vectorised loop can be safely entered, as long as the loads and
stores are masked off.
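
To illustrate the overall shape (this sketch is not part of the patch; the
value names are made up, and the intrinsic signatures follow the AArch64
tests added below), the vectorised body for a single i8 load-then-store
pair ends up looking roughly like:

  ; Rough sketch only: one aliasing pointer pair (%b read, %c written),
  ; element size 1 byte, tail-folded with an active lane mask.
  define void @sketch(ptr %b, ptr %c, i64 %n) {
  entry:
    ; One mask per possibly-aliasing pointer pair, computed in the preheader.
    %alias.mask = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %b, ptr %c, i64 1)
    %entry.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %n)
    br label %vector.body

  vector.body:
    %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
    %active.mask = phi <vscale x 16 x i1> [ %entry.mask, %entry ], [ %next.mask, %vector.body ]
    ; Loads and stores are predicated on lanes that are both in bounds and
    ; free of an aliasing hazard.
    %mask = and <vscale x 16 x i1> %active.mask, %alias.mask
    %gep.b = getelementptr i8, ptr %b, i64 %index
    %vals = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %gep.b, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> poison)
    %gep.c = getelementptr i8, ptr %c, i64 %index
    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %vals, ptr %gep.c, i32 1, <vscale x 16 x i1> %mask)
    ; The IV only advances by the number of safe lanes, i.e. the popcount of
    ; the alias mask (an add reduction after extending the i1s to i8).
    %alias.ext = zext <vscale x 16 x i1> %alias.mask to <vscale x 16 x i8>
    %popcnt.i8 = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %alias.ext)
    %popcnt = zext i8 %popcnt.i8 to i64
    %index.next = add i64 %index, %popcnt
    %next.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %n)
    %continue = extractelement <vscale x 16 x i1> %next.mask, i32 0
    br i1 %continue, label %vector.body, label %exit

  exit:
    ret void
  }

  declare <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr, ptr, i64)
  declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
  declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
  declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr, i32, <vscale x 16 x i1>)
  declare i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8>)
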
---
.../llvm/Analysis/LoopAccessAnalysis.h | 5 +-
.../llvm/Analysis/TargetTransformInfo.h | 7 +
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 5 +-
llvm/lib/Transforms/Utils/LoopUtils.cpp | 3 +-
.../Vectorize/LoopVectorizationPlanner.h | 22 ++-
.../Transforms/Vectorize/LoopVectorize.cpp | 101 +++++++---
llvm/lib/Transforms/Vectorize/VPlan.h | 52 +++++
.../Transforms/Vectorize/VPlanAnalysis.cpp | 4 +
.../Vectorize/VPlanConstruction.cpp | 30 +--
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 62 +++++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 70 +++++--
.../Transforms/Vectorize/VPlanTransforms.h | 11 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../LoopVectorize/AArch64/alias_mask.ll | 185 ++++++++++++++++++
.../AArch64/induction-costs-sve.ll | 65 ++++--
.../runtime-checks-difference.ll | 5 +-
.../vplan-printing-predicated.ll | 115 +++++++++++
17 files changed, 662 insertions(+), 81 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 92304edd67a44..5651b597a77fc 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -489,11 +489,12 @@ struct PointerDiffInfo {
const SCEV *SinkStart;
unsigned AccessSize;
bool NeedsFreeze;
+ bool WriteAfterRead;
PointerDiffInfo(const SCEV *SrcStart, const SCEV *SinkStart,
- unsigned AccessSize, bool NeedsFreeze)
+ unsigned AccessSize, bool NeedsFreeze, bool WriteAfterRead)
: SrcStart(SrcStart), SinkStart(SinkStart), AccessSize(AccessSize),
- NeedsFreeze(NeedsFreeze) {}
+ NeedsFreeze(NeedsFreeze), WriteAfterRead(WriteAfterRead) {}
};
/// Holds information about the memory runtime legality checks to verify
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9186419715cc4..38a46d18b5b54 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -201,6 +201,13 @@ enum class TailFoldingStyle {
DataWithEVL,
};
+enum class RTCheckStyle {
+  /// Branch to the scalar loop if the runtime checks fail.
+ ScalarFallback,
+  /// Form a mask of the elements that won't cause a WAR or RAW hazard.
+ UseSafeEltsMask,
+};
+
struct TailFoldingInfo {
TargetLibraryInfo *TLI;
LoopVectorizationLegality *LVL;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 62baf9b632bc7..43fa20b3c81af 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -482,11 +482,14 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
}
}
+ bool WriteAfterRead = isa<LoadInst>(SrcInsts[0]);
+
LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
<< "SrcStart: " << *SrcStartInt << '\n'
<< "SinkStartInt: " << *SinkStartInt << '\n');
DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
- Src->NeedsFreeze || Sink->NeedsFreeze);
+ Src->NeedsFreeze || Sink->NeedsFreeze,
+ WriteAfterRead);
return true;
}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 2d830f3b6f952..2d3e7f85ee3a8 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2034,7 +2034,8 @@ Value *llvm::addDiffRuntimeChecks(
// Map to keep track of created compares, The key is the pair of operands for
// the compare, to allow detecting and re-using redundant compares.
DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
- for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze] : Checks) {
+ for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze,
+ WriteAfterRead] : Checks) {
Type *Ty = SinkStart->getType();
// Compute VF * IC * AccessSize.
auto *VFTimesICTimesSize =
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 4856ebebb596f..b5d9d6e4b7837 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -493,7 +493,13 @@ class LoopVectorizationPlanner {
/// Build VPlans for the specified \p UserVF and \p UserIC if they are
/// non-zero or all applicable candidate VFs otherwise. If vectorization and
/// interleaving should be avoided up-front, no plans are generated.
- void plan(ElementCount UserVF, unsigned UserIC);
+  /// \p DiffChecks is a list of pointer pairs that should be checked for
+  /// aliasing. \p HasAliasMask is set to true if an alias mask is generated;
+  /// in that case the vector loop should be entered even if the pointers
+  /// alias within a loop iteration.
+ void plan(ElementCount UserVF, unsigned UserIC,
+ std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
+ bool &HasAliasMask);
/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
@@ -578,13 +584,23 @@ class LoopVectorizationPlanner {
/// set the largest included VF to the maximum VF for which no plan could be
/// built. Each VPlan is built starting from a copy of \p InitialPlan, which
/// is a plain CFG VPlan wrapping the original scalar loop.
+  /// \p RTChecks is a list of pointer pairs that should be checked for
+  /// aliasing. \p HasAliasMask is set to true if an alias mask is generated;
+  /// in that case the vector loop should be entered even if the pointers
+  /// alias within a loop iteration.
VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range,
- LoopVersioning *LVer);
+ LoopVersioning *LVer,
+ ArrayRef<PointerDiffInfo> RTChecks,
+ bool &HasAliasMask);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop. This method creates VPlans using VPRecipes.
- void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
+ /// RTChecks contains a list of pointer pairs that an alias mask should be
+ /// generated for.
+ void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
+ ArrayRef<PointerDiffInfo> RTChecks,
+ bool &HasAliasMask);
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09c9e63ff6a25..6900dbb217d07 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -178,6 +178,7 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
+STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");
static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -566,7 +567,11 @@ class InnerLoopVectorizer {
/// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
/// vector preheader and its predecessor, also connecting the new block to the
/// scalar preheader.
- void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
+  /// If \p HasAliasMask is true, the vector loop is branched to
+  /// unconditionally, instead of conditionally branching to either the
+  /// scalar or the vector loop.
+ void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB,
+ bool HasAliasMask = false);
/// The original loop.
Loop *OrigLoop;
@@ -1343,6 +1348,21 @@ class LoopVectorizationCostModel {
: ChosenTailFoldingStyle->second;
}
+ RTCheckStyle getRTCheckStyle(TailFoldingStyle TFStyle) const {
+ switch (TFStyle) {
+ case TailFoldingStyle::Data:
+ case TailFoldingStyle::DataAndControlFlow:
+ case TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck:
+ return RTCheckStyle::UseSafeEltsMask;
+ default:
+ return RTCheckStyle::ScalarFallback;
+ }
+ }
+
+ RTCheckStyle getRTCheckStyle() const {
+ return getRTCheckStyle(getTailFoldingStyle());
+ }
+
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
/// \param IsScalableVF true if scalable vector factors enabled.
@@ -1798,6 +1818,10 @@ class GeneratedRTChecks {
TTI::TargetCostKind CostKind;
public:
+ /// Set by VPlan when the vector loop should be entered even when runtime
+ /// checks determine that pointers alias within an iteration.
+ bool HasAliasMask = false;
+
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
const DataLayout &DL, TTI::TargetCostKind CostKind)
@@ -2059,6 +2083,10 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
+static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style) {
+ return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask;
+}
+
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -2268,7 +2296,7 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}
-void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
+void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB, bool HasAliasMask) {
// Note: The block with the minimum trip-count check is already connected
// during earlier VPlan construction.
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
@@ -2278,17 +2306,19 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPBB, CheckVPIRBB);
PreVectorPH = CheckVPIRBB;
- VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
- PreVectorPH->swapSuccessors();
-
- // We just connected a new block to the scalar preheader. Update all
- // VPPhis by adding an incoming value for it, replicating the last value.
- unsigned NumPredecessors = ScalarPH->getNumPredecessors();
- for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
- assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
- assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
- "must have incoming values for all operands");
- R.addOperand(R.getOperand(NumPredecessors - 2));
+ if (!HasAliasMask) {
+ VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
+ PreVectorPH->swapSuccessors();
+
+ // We just connected a new block to the scalar preheader. Update all
+ // VPPhis by adding an incoming value for it, replicating the last value.
+ unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+ for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+ assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+ assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+ "must have incoming values for all operands");
+ R.addOperand(R.getOperand(NumPredecessors - 2));
+ }
}
}
@@ -2370,7 +2400,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
static_cast<DominatorTree *>(nullptr), LI,
nullptr, "vector.ph");
-
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
@@ -6694,7 +6723,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
return VectorizationFactor::Disabled();
}
-void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+void LoopVectorizationPlanner::plan(
+ ElementCount UserVF, unsigned UserIC,
+ std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
@@ -6703,6 +6734,12 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
return;
+ ArrayRef<PointerDiffInfo> DiffChecks;
+ auto TFStyle = CM.getTailFoldingStyle();
+ if (RTChecks.has_value() &&
+ useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle)))
+ DiffChecks = *RTChecks;
+
// Invalidate interleave groups if all blocks of loop will be predicated.
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
!useMaskedInterleavedAccesses(TTI)) {
@@ -6735,7 +6772,7 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectInLoopReductions();
if (CM.selectUserVectorizationFactor(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- buildVPlansWithVPRecipes(UserVF, UserVF);
+ buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks, HasAliasMask);
LLVM_DEBUG(printPlans(dbgs()));
return;
}
@@ -6759,8 +6796,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectNonVectorizedAndSetWideningDecisions(VF);
}
- buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
- buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
+ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
+ DiffChecks, HasAliasMask);
+ buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
+ DiffChecks, HasAliasMask);
LLVM_DEBUG(printPlans(dbgs()));
}
@@ -8368,7 +8407,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
- ElementCount MaxVF) {
+ ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
+ bool &HasAliasMask) {
if (ElementCount::isKnownGT(MinVF, MaxVF))
return;
@@ -8394,7 +8434,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
VFRange SubRange = {VF, MaxVFTimes2};
if (auto Plan = tryToBuildVPlanWithVPRecipes(
- std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
+ std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer, RTChecks, HasAliasMask)) {
bool HasScalarVF = Plan->hasScalarVFOnly();
// Now optimize the initial VPlan.
if (!HasScalarVF)
@@ -8619,7 +8659,7 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
}
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
- VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
+ VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask) {
using namespace llvm::VPlanPatternMatch;
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8908,7 +8948,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
bool WithoutRuntimeCheck =
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
- WithoutRuntimeCheck);
+ WithoutRuntimeCheck, PSE, RTChecks);
+ if (ForControlFlow && !RTChecks.empty())
+ HasAliasMask = true;
}
VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
@@ -9324,7 +9366,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
"Cannot SCEV check stride or overflow when optimizing for size");
VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
- HasBranchWeights);
+ HasBranchWeights, RTChecks.HasAliasMask);
}
const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
if (MemCheckBlock) {
@@ -9349,7 +9391,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
});
}
VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
- HasBranchWeights);
+ HasBranchWeights, RTChecks.HasAliasMask);
}
}
@@ -9480,6 +9522,7 @@ static bool processLoopInVPlanNativePath(
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+
return true;
}
@@ -10094,15 +10137,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
+ CM.CostKind);
+
// Plan how to best vectorize.
- LVP.plan(UserVF, UserIC);
+ LVP.plan(UserVF, UserIC,
+ LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(),
+ Checks.HasAliasMask);
VectorizationFactor VF = LVP.computeBestVF();
+ if (Checks.HasAliasMask)
+ LoopsAliasMasked++;
unsigned IC = 1;
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 02b73c3f1dba9..fbf12c9483641 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -529,6 +529,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
switch (R->getVPDefID()) {
case VPRecipeBase::VPDerivedIVSC:
case VPRecipeBase::VPEVLBasedIVPHISC:
+ case VPRecipeBase::VPAliasLaneMaskSC:
case VPRecipeBase::VPExpandSCEVSC:
case VPRecipeBase::VPExpressionSC:
case VPRecipeBase::VPInstructionSC:
@@ -987,6 +988,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// during unrolling.
ExtractPenultimateElement,
LogicalAnd, // Non-poison propagating logical And.
+ PopCount,
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
@@ -3236,6 +3238,56 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
}
};
+// Given a pointer A that is being loaded from and a pointer B that is being
+// stored to, both with unknown lengths, create a mask that disables the
+// elements which could overlap across a loop iteration. For example, if A is
+// X and B is X + 2 with VF being 4, only the first two elements of the loaded
+// vector can be stored since they don't overlap with the stored vector:
+//   %a.vec = load %a  ; = [s, t, u, v]
+//   [...]
+//   store %a.vec, %b  ; only s and t can be stored as their addresses don't
+//                     ; overlap with %a + (VF - 1)
+class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
+
+public:
+ VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink, unsigned ElementSize,
+ bool WriteAfterRead)
+ : VPSingleDefRecipe(VPDef::VPAliasLaneMaskSC, {Src, Sink}),
+ ElementSize(ElementSize), WriteAfterRead(WriteAfterRead) {}
+
+ ~VPAliasLaneMaskRecipe() override = default;
+
+ VPAliasLaneMaskRecipe *clone() override {
+ return new VPAliasLaneMaskRecipe(getSourceValue(), getSinkValue(),
+ ElementSize, WriteAfterRead);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPAliasLaneMaskSC);
+
+ void execute(VPTransformState &State) override;
+
+ /// Get the VPValue* for the pointer being read from
+ VPValue *getSourceValue() const { return getOperand(0); }
+
+  /// Get the size of the element(s) accessed by the pointers
+ unsigned getAccessedElementSize() const { return ElementSize; }
+
+ /// Get the VPValue* for the pointer being stored to
+ VPValue *getSinkValue() const { return getOperand(1); }
+
+ bool isWriteAfterRead() const { return WriteAfterRead; }
+
+private:
+ unsigned ElementSize;
+ bool WriteAfterRead;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// Recipe to expand a SCEV expression.
class VPExpandSCEVRecipe : public VPSingleDefRecipe {
const SCEV *Expr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index b39231f106300..0861a34ac3e42 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -134,6 +134,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
return Type::getVoidTy(Ctx);
+ case VPInstruction::PopCount:
+ return Type::getInt64Ty(Ctx);
default:
break;
}
@@ -308,6 +310,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPExpressionRecipe>([this](const auto *R) {
return inferScalarType(R->getOperandOfResultType());
+ }).Case<VPAliasLaneMaskRecipe>([this](const VPAliasLaneMaskRecipe *R) {
+ return Type::getInt1Ty(Ctx);
});
assert(ResultTy && "could not infer type for the given VPValue");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index b231a8429503f..4d8080df02930 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -638,24 +638,26 @@ static constexpr uint32_t CheckBypassWeights[] = {1, 127};
void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
BasicBlock *CheckBlock,
- bool AddBranchWeights) {
- VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
+ bool AddBranchWeights, bool HasAliasMask) {
VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
VPBlockBase *VectorPH = Plan.getVectorPreheader();
- VPBlockBase *ScalarPH = Plan.getScalarPreheader();
VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlockVPBB);
- VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
- CheckBlockVPBB->swapSuccessors();
-
- // We just connected a new block to the scalar preheader. Update all
- // VPPhis by adding an incoming value for it, replicating the last value.
- unsigned NumPredecessors = ScalarPH->getNumPredecessors();
- for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
- assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
- assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
- "must have incoming values for all operands");
- R.addOperand(R.getOperand(NumPredecessors - 2));
+ VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
+ if (!HasAliasMask) {
+ VPBlockBase *ScalarPH = Plan.getScalarPreheader();
+ VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
+ CheckBlockVPBB->swapSuccessors();
+
+ // We just connected a new block to the scalar preheader. Update all
+ // VPPhis by adding an incoming value for it, replicating the last value.
+ unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+ for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+ assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+ assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+ "must have incoming values for all operands");
+ R.addOperand(R.getOperand(NumPredecessors - 2));
+ }
}
VPIRMetadata VPBranchWeights;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fa62547d374cd..a2d758bbd7d7c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -519,6 +519,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::ExplicitVectorLength:
case VPInstruction::AnyOf:
case VPInstruction::Not:
+ case VPInstruction::PopCount:
return true;
default:
return false;
@@ -625,6 +626,29 @@ Value *VPInstruction::generate(VPTransformState &State) {
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
}
+ // Count the number of bits set in each lane and reduce the result to a scalar
+ case VPInstruction::PopCount: {
+ Value *Op = State.get(getOperand(0));
+ Type *VT = Op->getType();
+ Value *Cnt = Op;
+
+ // i1 vectors can just use the add reduction. Bigger elements need a ctpop
+ // first.
+ if (VT->getScalarSizeInBits() > 1)
+ Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {VT}, {Cnt});
+
+ auto *VecVT = cast<VectorType>(VT);
+    // Extend to i8 first, since i1 is too narrow to do the add reduction on.
+ if (VecVT->getElementType()->getScalarSizeInBits() < 8) {
+ Cnt = Builder.CreateCast(
+ Instruction::ZExt, Cnt,
+ VectorType::get(Builder.getInt8Ty(), VecVT->getElementCount()));
+ }
+
+ Cnt = Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Cnt);
+ Cnt = Builder.CreateCast(Instruction::ZExt, Cnt, Builder.getInt64Ty());
+ return Cnt;
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
//
@@ -1033,7 +1057,7 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
getOpcode() == VPInstruction::ComputeReductionResult ||
- getOpcode() == VPInstruction::AnyOf;
+ getOpcode() == VPInstruction::AnyOf || getOpcode() == PopCount;
}
bool VPInstruction::isSingleScalar() const {
@@ -1206,6 +1230,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ActiveLaneMask:
O << "active lane mask";
break;
+ case VPInstruction::PopCount:
+ O << "popcount";
+ break;
case VPInstruction::ExplicitVectorLength:
O << "EXPLICIT-VECTOR-LENGTH";
break;
@@ -3748,6 +3775,39 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
+ IRBuilderBase Builder = State.Builder;
+ Value *SinkValue = State.get(getSinkValue(), true);
+ Value *SourceValue = State.get(getSourceValue(), true);
+
+ unsigned IntrinsicID = WriteAfterRead
+ ? Intrinsic::loop_dependence_war_mask
+ : Intrinsic::loop_dependence_raw_mask;
+ Value *SourceAsPtr = Builder.CreateCast(Instruction::IntToPtr, SourceValue,
+ Builder.getPtrTy());
+ Value *SinkAsPtr =
+ Builder.CreateCast(Instruction::IntToPtr, SinkValue, Builder.getPtrTy());
+ Value *AliasMask = Builder.CreateIntrinsic(
+ IntrinsicID, {VectorType::get(Builder.getInt1Ty(), State.VF)},
+ {SourceAsPtr, SinkAsPtr, Builder.getInt64(getAccessedElementSize())},
+ nullptr, "alias.lane.mask");
+ State.set(this, AliasMask, /*IsScalar=*/false);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ getVPSingleValue()->printAsOperand(O, SlotTracker);
+ O << " = ALIAS-LANE-MASK ";
+ getSourceValue()->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getSinkValue()->printAsOperand(O, SlotTracker);
+ O << " (" << (WriteAfterRead ? "write-after-read" : "read-after-write")
+ << ")";
+}
+#endif
+
void VPExpandSCEVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "cannot be used in per-lane");
const DataLayout &DL = SE.getDataLayout();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 22e02abf229d7..4be54a8eaf15b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1978,8 +1978,9 @@ void VPlanTransforms::optimize(VPlan &Plan) {
// %Negated = Not %ALM
// branch-on-cond %Negated
//
-static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
- VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+static VPValue *addVPLaneMaskPhiAndUpdateExitBranch(
+ VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck,
+ PredicatedScalarEvolution &PSE, ArrayRef<PointerDiffInfo> RTChecks) {
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
auto *CanonicalIVPHI = Plan.getCanonicalIV();
@@ -1989,14 +1990,36 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
// TODO: Check if dropping the flags is needed if
// !DataAndControlFlowWithoutRuntimeCheck.
+ VPValue *IncVal = CanonicalIVIncrement->getOperand(1);
+ assert(IncVal != CanonicalIVPHI && "Unexpected operand order");
+
CanonicalIVIncrement->dropPoisonGeneratingFlags();
DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+
// We can't use StartV directly in the ActiveLaneMask VPInstruction, since
// we have to take unrolling into account. Each part needs to start at
// Part * VF
auto *VecPreheader = Plan.getVectorPreheader();
VPBuilder Builder(VecPreheader);
+  // Create an alias mask for each possibly-aliasing pointer pair. If there
+  // are multiple, they are combined with ANDs.
+ VPValue *AliasMask = nullptr;
+
+ for (auto C : RTChecks) {
+ VPValue *Sink =
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SinkStart, *PSE.getSE());
+ VPValue *Src =
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SrcStart, *PSE.getSE());
+ VPAliasLaneMaskRecipe *M =
+ new VPAliasLaneMaskRecipe(Src, Sink, C.AccessSize, C.WriteAfterRead);
+ VecPreheader->appendRecipe(M);
+ if (AliasMask)
+ AliasMask = Builder.createAnd(AliasMask, M);
+ else
+ AliasMask = M;
+ }
+
// Create the ActiveLaneMask instruction using the correct start values.
VPValue *TC = Plan.getTripCount();
@@ -2020,14 +2043,35 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
"index.part.next");
// Create the active lane mask instruction in the VPlan preheader.
- auto *EntryALM =
+ VPValue *Mask =
Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
DL, "active.lane.mask.entry");
// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
- auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+ auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(Mask, DebugLoc());
LaneMaskPhi->insertAfter(CanonicalIVPHI);
+ VPValue *LaneMask = LaneMaskPhi;
+ if (AliasMask) {
+    // Increment the phi by the correct amount.
+ Builder.setInsertPoint(CanonicalIVIncrement);
+
+ VPValue *IncrementBy = Builder.createNaryOp(VPInstruction::PopCount,
+ {AliasMask}, DL, "popcount");
+ Type *IVType = CanonicalIVPHI->getScalarType();
+
+ if (IVType->getScalarSizeInBits() < 64)
+ IncrementBy =
+ Builder.createScalarCast(Instruction::Trunc, IncrementBy, IVType,
+ CanonicalIVIncrement->getDebugLoc());
+ CanonicalIVIncrement->setOperand(1, IncrementBy);
+
+    // AND in the alias mask so only non-aliasing lanes are processed.
+ Builder.setInsertPoint(CanonicalIVPHI->getParent(),
+ CanonicalIVPHI->getParent()->getFirstNonPhi());
+ LaneMask = Builder.createNaryOp(Instruction::BinaryOps::And,
+ {LaneMaskPhi, AliasMask}, DL);
+ }
// Create the active lane mask for the next iteration of the loop before the
// original terminator.
@@ -2046,7 +2090,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
auto *NotMask = Builder.createNot(ALM, DL);
Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
OriginalTerminator->eraseFromParent();
- return LaneMaskPhi;
+ return LaneMask;
}
/// Collect the header mask with the pattern:
@@ -2097,24 +2141,26 @@ static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
void VPlanTransforms::addActiveLaneMask(
VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
- bool DataAndControlFlowWithoutRuntimeCheck) {
+ bool DataAndControlFlowWithoutRuntimeCheck, PredicatedScalarEvolution &PSE,
+ ArrayRef<PointerDiffInfo> RTChecks) {
+
assert((!DataAndControlFlowWithoutRuntimeCheck ||
UseActiveLaneMaskForControlFlow) &&
"DataAndControlFlowWithoutRuntimeCheck implies "
"UseActiveLaneMaskForControlFlow");
- auto *FoundWidenCanonicalIVUser =
- find_if(Plan.getCanonicalIV()->users(),
- [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
- assert(FoundWidenCanonicalIVUser &&
+ auto IVUsers = Plan.getCanonicalIV()->users();
+ auto *FoundWidenCanonicalIVUser = find_if(
+ IVUsers, [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ assert(FoundWidenCanonicalIVUser != IVUsers.end() &&
"Must have widened canonical IV when tail folding!");
VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
auto *WideCanonicalIV =
cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
- VPSingleDefRecipe *LaneMask;
+ VPValue *LaneMask;
if (UseActiveLaneMaskForControlFlow) {
LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
- Plan, DataAndControlFlowWithoutRuntimeCheck);
+ Plan, DataAndControlFlowWithoutRuntimeCheck, PSE, RTChecks);
} else {
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 35fa45ced53e0..a23ab1c09d253 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -89,9 +89,9 @@ struct VPlanTransforms {
/// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
/// VPValue and connect the block to \p Plan, using the VPValue as branch
- /// condition.
+  /// condition. If an alias mask has been set up, the check block won't branch to the scalar preheader.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
- bool AddBranchWeights);
+ bool AddBranchWeights, bool HasAliasMask);
/// Replaces the VPInstructions in \p Plan with corresponding
/// widen recipes. Returns false if any VPInstructions could not be converted
@@ -155,9 +155,14 @@ struct VPlanTransforms {
/// creation) and instead it is handled using active-lane-mask. \p
/// DataAndControlFlowWithoutRuntimeCheck implies \p
/// UseActiveLaneMaskForControlFlow.
+  /// \p PSE is used to expand the SCEVs of the pointers in the runtime
+  /// checks. \p RTChecks lists the pointer pairs whose aliasing lanes must
+  /// be masked off each loop iteration.
static void addActiveLaneMask(VPlan &Plan,
bool UseActiveLaneMaskForControlFlow,
- bool DataAndControlFlowWithoutRuntimeCheck);
+ bool DataAndControlFlowWithoutRuntimeCheck,
+ PredicatedScalarEvolution &PSE,
+ ArrayRef<PointerDiffInfo> RTChecks);
/// Insert truncates and extends for any truncated recipe. Redundant casts
/// will be folded later.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef6..be73a4b72965b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -331,6 +331,7 @@ class VPDef {
using VPRecipeTy = enum {
VPBranchOnMaskSC,
VPDerivedIVSC,
+ VPAliasLaneMaskSC,
VPExpandSCEVSC,
VPExpressionSC,
VPIRInstructionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
new file mode 100644
index 0000000000000..4f1696b7d7e1f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --filter-out-after "^scalar.ph:" --version 4
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+
+define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define dso_local void @alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[C2:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[C2]], [[B3]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP7]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP5]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[B2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[C1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT6]], i32 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+; CHECK-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP14]] to ptr
+; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP30]], ptr [[TMP31]], i64 1)
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP8]], 16
+; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[TMP15]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt i64 [[N]], [[TMP15]]
+; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[TMP25]])
+; CHECK-NEXT: [[TMP28:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
+; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP28]])
+; CHECK-NEXT: [[TMP24:%.*]] = zext i8 [[TMP23]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
+; CHECK-NEXT: [[TMP26:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i32 0
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+;
+entry:
+ %cmp11 = icmp sgt i64 %n, 0
+ br i1 %cmp11, label %for.body, label %exit
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %add = add i8 %load.b, %load.a
+ %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+ store i8 %add, ptr %gep.c, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body, %entry
+ ret void
+}
+
+define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define i32 @alias_mask_read_after_write(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[C2:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[B2]], [[C3]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP8]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[C2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[B1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT6]], i32 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP28]] to ptr
+; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP15]] to ptr
+; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[TMP34]], ptr [[TMP35]], i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP16]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP31:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP20]], i32 2, <vscale x 4 x i1> [[TMP31]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP23:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP24:%.*]] = add <vscale x 4 x i32> [[TMP23]], [[WIDE_MASKED_LOAD5]]
+; CHECK-NEXT: [[TMP25]] = select <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP32:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
+; CHECK-NEXT: [[TMP26:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP32]])
+; CHECK-NEXT: [[TMP27:%.*]] = zext i8 [[TMP26]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
+; CHECK-NEXT: [[TMP29:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <vscale x 4 x i1> [[TMP29]], i32 0
+; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+;
+entry:
+ %cmp19 = icmp sgt i64 %n, 0
+ br i1 %cmp19, label %for.body, label %exit
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+ %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+ %load.a = load i32, ptr %gep.a, align 2
+ %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+ store i32 %load.a, ptr %gep.c, align 2
+ %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+ %load.b = load i32, ptr %gep.b, align 2
+ %add = add i32 %load.a, %accum
+ %add2 = add i32 %add, %load.b
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %entry, %for.body
+ %result = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+ ret i32 %result
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 137e07336fd50..a034661432790 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -28,6 +28,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP13:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
@@ -58,7 +60,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP38]], i64 [[TMP42]]
; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP36]], ptr [[TMP38]], align 1
; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP37]], ptr [[TMP43]], align 1
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; DEFAULT-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
@@ -90,19 +92,36 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[ENTRY:.*]]:
; PRED-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
; PRED-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT: [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT: [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; PRED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; PRED: [[VECTOR_MEMCHECK]]:
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
-; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; PRED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; PRED-NEXT: br label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
+; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP5]], 1
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
+; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[SRC2]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; PRED-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[DST1]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PRED-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT6]], i32 0
+; PRED-NEXT: [[TMP33:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT1]], i32 0
+; PRED-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+; PRED-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP25]] to ptr
+; PRED-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP34]], ptr [[TMP35]], i64 1)
; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 4
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -113,7 +132,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
; PRED: [[VECTOR_BODY]]:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], [[ALIAS_LANE_MASK]]
; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; PRED-NEXT: [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
@@ -123,16 +143,20 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
+; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
+; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT: [[TMP30:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
+; PRED-NEXT: [[TMP31:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP30]])
+; PRED-NEXT: [[TMP32:%.*]] = zext i8 [[TMP31]] to i64
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP32]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT: [[TMP27:%.*]] = xor i1 [[TMP25]], true
-; PRED-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
+; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
; PRED: [[SCALAR_PH]]:
-; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; PRED-NEXT: br label %[[LOOP:.*]]
; PRED: [[LOOP]]:
; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -262,6 +286,9 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2
; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2
; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
@@ -293,9 +320,9 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; PRED: [[PRED_STORE_CONTINUE2]]:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT: [[TMP25:%.*]] = xor i1 [[TMP24]], true
+; PRED-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; PRED-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0
; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
@@ -430,6 +457,9 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
; PRED-NEXT: br i1 [[TMP13]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4
; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
@@ -482,9 +512,9 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; PRED: [[PRED_STORE_CONTINUE7]]:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
-; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true
+; PRED-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PRED-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0
; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
@@ -621,6 +651,9 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4
; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4
; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
@@ -673,9 +706,9 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; PRED: [[PRED_STORE_CONTINUE6]]:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT: [[TMP35:%.*]] = xor i1 [[TMP34]], true
+; PRED-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0
; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index b640c1911cb0d..e48979c4532a6 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -339,12 +339,13 @@ exit:
; of the outer loop as start value. It is sufficient to subtract the start
; values (%dst, %src) of the outer AddRecs.
define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %m, i64 noundef %n) {
+;
; CHECK-LABEL: define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(
; CHECK-SAME: ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i64 noundef [[M:%.*]], i64 noundef [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
-; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
; CHECK-NEXT: br label %[[OUTER_LOOP:.*]]
; CHECK: [[OUTER_LOOP]]:
; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
@@ -352,7 +353,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
-; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[SUB]], 16
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-NEXT: br i1 [[DIFF_CHECK]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
new file mode 100644
index 0000000000000..d3c7c9c14802c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
@@ -0,0 +1,115 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data-and-control -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Tests for printing predicated VPlans.
+
+define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: 'alias_mask'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body.preheader>:
+; CHECK-NEXT: IR %wide.trip.count = zext nneg i32 %n to i64
+; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT: EMIT vp<%4> = EXPAND SCEV (ptrtoint ptr %c to i64)
+; CHECK-NEXT: EMIT vp<%5> = EXPAND SCEV (ptrtoint ptr %b to i64)
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<%6> = ALIAS-LANE-MASK vp<%5>, vp<%4> (write-after-read)
+; CHECK-NEXT: EMIT vp<%index.part.next> = VF * Part + ir<0>
+; CHECK-NEXT: EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, vp<%3>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%7> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: ACTIVE-LANE-MASK-PHI vp<%8> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next>
+; CHECK-NEXT: EMIT vp<%9> = and vp<%8>, vp<%6>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<%9>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: vp<%10> = SCALAR-STEPS vp<%7>, ir<1>, vp<%0>
+; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%10>
+; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%10>
+; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT: REPLICATE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<%10>
+; CHECK-NEXT: REPLICATE ir<%add> = add ir<%1>, ir<%0>
+; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx6>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT: EMIT vp<%popcount> = popcount vp<%6>
+; CHECK-NEXT: EMIT vp<%index.next> = add vp<%7>, vp<%popcount>
+; CHECK-NEXT: EMIT vp<%11> = VF * Part + vp<%index.next>
+; CHECK-NEXT: EMIT vp<%active.lane.mask.next> = active lane mask vp<%11>, vp<%3>
+; CHECK-NEXT: EMIT vp<%12> = not vp<%active.lane.mask.next>
+; CHECK-NEXT: EMIT branch-on-cond vp<%12>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ ir<0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT: IR %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+; CHECK-NEXT: IR %0 = load i8, ptr %arrayidx, align 1
+; CHECK-NEXT: IR %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+; CHECK-NEXT: IR %1 = load i8, ptr %arrayidx2, align 1
+; CHECK-NEXT: IR %add = add i8 %1, %0
+; CHECK-NEXT: IR %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+; CHECK-NEXT: IR store i8 %add, ptr %arrayidx6, align 1
+; CHECK-NEXT: IR %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx2, align 1
+ %add = add i8 %1, %0
+ %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+ store i8 %add, ptr %arrayidx6, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
>From 1af806f3197333963d83a34a968f0eaf9db3181b Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 29 Jul 2025 10:48:25 +0100
Subject: [PATCH 02/15] Check popcount of mask before entering vector body
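For reference, below is a minimal sketch of the shape of the entry check that
addDiffRuntimeChecks now builds when the safe-elements mask is in use. It is
illustrative only: the helper name, the IRBuilder position and the argument
names are assumptions rather than the exact code in this patch.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Hypothetical helper mirroring the UseSafeEltsMask path of
// addDiffRuntimeChecks; names and insertion point are illustrative.
static Value *buildSafeEltsEntryCheck(IRBuilderBase &B, Value *SrcPtr,
                                      Value *SinkPtr, uint64_t AccessSize,
                                      ElementCount VF, bool WriteAfterRead) {
  unsigned IntOpc = WriteAfterRead ? Intrinsic::loop_dependence_war_mask
                                   : Intrinsic::loop_dependence_raw_mask;
  // Mask of lanes that cannot alias within one vector iteration.
  Value *Mask = B.CreateIntrinsic(
      IntOpc, {VectorType::get(B.getInt1Ty(), VF)},
      {SrcPtr, SinkPtr, B.getInt64(AccessSize)}, nullptr, "alias.lane.mask");
  // Widen the i1 lanes to i8 so the add reduction can count them.
  Value *Wide = B.CreateZExt(Mask, VectorType::get(B.getInt8Ty(), VF));
  Value *PopCount = B.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Wide);
  // The lane count (the same popcount that advances the IV inside the vector
  // body) becomes the memcheck condition selecting between the preheaders.
  return B.CreateICmpUGT(B.CreateZExt(PopCount, B.getInt64Ty()),
                         B.getInt64(0));
}

The AArch64 change in this patch models the loop_dependence_{war,raw}_mask
intrinsics as cost 1 when SVE2 is available, and invalid otherwise.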
---
.../llvm/Analysis/TargetTransformInfo.h | 4 +-
.../include/llvm/Transforms/Utils/LoopUtils.h | 8 +-
.../AArch64/AArch64TargetTransformInfo.cpp | 5 +
llvm/lib/Transforms/Utils/LoopUtils.cpp | 80 +++++++---
.../Vectorize/LoopVectorizationPlanner.h | 24 ++-
.../Transforms/Vectorize/LoopVectorize.cpp | 141 +++++++++++-------
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
.../Vectorize/VPlanConstruction.cpp | 30 ++--
.../Transforms/Vectorize/VPlanTransforms.cpp | 8 +-
.../Transforms/Vectorize/VPlanTransforms.h | 4 +-
.../LoopVectorize/AArch64/alias_mask.ll | 113 ++++----------
.../vplan-printing-predicated.ll | 18 +--
12 files changed, 230 insertions(+), 207 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 38a46d18b5b54..f96f2f411b6b6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -202,8 +202,8 @@ enum class TailFoldingStyle {
};
enum class RTCheckStyle {
- /// Branch to scalar loop if checks fails at runtime.
- ScalarFallback,
+  /// Create runtime checks based on the difference between two pointers.
+ ScalarDifference,
/// Form a mask based on elements which won't be a WAR or RAW hazard.
UseSafeEltsMask,
};
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 723f6aea1b76f..e9515c32b0603 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -569,9 +569,11 @@ addRuntimeChecks(Instruction *Loc, Loop *TheLoop,
const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
SCEVExpander &Expander, bool HoistRuntimeChecks = false);
-LLVM_ABI Value *addDiffRuntimeChecks(
- Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
- function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC);
+LLVM_ABI Value *
+addDiffRuntimeChecks(Instruction *Loc, ArrayRef<PointerDiffInfo> Checks,
+ SCEVExpander &Expander,
+ function_ref<Value *(IRBuilderBase &, unsigned)> GetVF,
+ unsigned IC, ElementCount VF, bool UseSafeEltsMask);
/// Struct to hold information about a partially invariant condition.
struct IVConditionInfo {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3042251cf754d..e42484353259f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -999,6 +999,11 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::loop_dependence_raw_mask:
+ case Intrinsic::loop_dependence_war_mask:
+ if (ST->hasSVE2())
+ return 1;
+ return InstructionCost::getInvalid(CostKind);
default:
break;
}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 2d3e7f85ee3a8..248f6fa7d7f5e 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2022,7 +2022,8 @@ Value *llvm::addRuntimeChecks(
Value *llvm::addDiffRuntimeChecks(
Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
- function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC) {
+ function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC,
+ ElementCount VF, bool UseSafeEltsMask) {
LLVMContext &Ctx = Loc->getContext();
IRBuilder ChkBuilder(Ctx, InstSimplifyFolder(Loc->getDataLayout()));
@@ -2034,33 +2035,68 @@ Value *llvm::addDiffRuntimeChecks(
// Map to keep track of created compares, The key is the pair of operands for
// the compare, to allow detecting and re-using redundant compares.
DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
+ Value *AliasLaneMask = nullptr;
for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze,
WriteAfterRead] : Checks) {
Type *Ty = SinkStart->getType();
- // Compute VF * IC * AccessSize.
- auto *VFTimesICTimesSize =
- ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
- ConstantInt::get(Ty, IC * AccessSize));
- Value *Diff =
- Expander.expandCodeFor(SE.getMinusSCEV(SinkStart, SrcStart), Ty, Loc);
-
- // Check if the same compare has already been created earlier. In that case,
- // there is no need to check it again.
- Value *IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
- if (IsConflict)
- continue;
+ if (!VF.isScalar() && UseSafeEltsMask) {
+ Value *Sink = Expander.expandCodeFor(SinkStart, Ty, Loc);
+ Value *Src = Expander.expandCodeFor(SrcStart, Ty, Loc);
+ unsigned IntOpc = WriteAfterRead ? Intrinsic::loop_dependence_war_mask
+ : Intrinsic::loop_dependence_raw_mask;
+ Value *SourceAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Src,
+ ChkBuilder.getPtrTy());
+ Value *SinkAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Sink,
+ ChkBuilder.getPtrTy());
+ Value *M = ChkBuilder.CreateIntrinsic(
+ IntOpc, {VectorType::get(ChkBuilder.getInt1Ty(), VF)},
+ {SourceAsPtr, SinkAsPtr, ChkBuilder.getInt64(AccessSize)}, nullptr,
+ "alias.lane.mask");
+ if (AliasLaneMask)
+ M = ChkBuilder.CreateAnd(AliasLaneMask, M);
+ else
+ AliasLaneMask = M;
+ } else {
+ // Compute VF * IC * AccessSize.
+ auto *VFTimesICTimesSize =
+ ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
+ ConstantInt::get(Ty, IC * AccessSize));
+ Value *Diff =
+ Expander.expandCodeFor(SE.getMinusSCEV(SinkStart, SrcStart), Ty, Loc);
+
+ // Check if the same compare has already been created earlier. In that
+ // case, there is no need to check it again.
+ Value *IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
+ if (IsConflict)
+ continue;
- IsConflict =
- ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
- SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
- if (NeedsFreeze)
- IsConflict =
- ChkBuilder.CreateFreeze(IsConflict, IsConflict->getName() + ".fr");
- if (MemoryRuntimeCheck) {
IsConflict =
- ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
+ ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
+ SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
+ if (NeedsFreeze)
+ IsConflict =
+ ChkBuilder.CreateFreeze(IsConflict, IsConflict->getName() + ".fr");
+ if (MemoryRuntimeCheck) {
+ IsConflict =
+ ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
+ }
+ MemoryRuntimeCheck = IsConflict;
}
- MemoryRuntimeCheck = IsConflict;
+ }
+
+ if (AliasLaneMask) {
+ auto *VecVT = VectorType::get(ChkBuilder.getInt1Ty(), VF);
+    // Extend to i8 first, since an i1 element type is too narrow to sum.
+ Value *PopCount = ChkBuilder.CreateCast(
+ Instruction::ZExt, AliasLaneMask,
+ VectorType::get(ChkBuilder.getInt8Ty(), VecVT->getElementCount()));
+
+ PopCount =
+ ChkBuilder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, PopCount);
+ PopCount = ChkBuilder.CreateCast(Instruction::ZExt, PopCount,
+ ChkBuilder.getInt64Ty());
+ MemoryRuntimeCheck = ChkBuilder.CreateICmpUGT(
+ PopCount, ConstantInt::get(ChkBuilder.getInt64Ty(), 0));
}
return MemoryRuntimeCheck;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index b5d9d6e4b7837..4911d2b69e8f8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -494,12 +494,10 @@ class LoopVectorizationPlanner {
/// non-zero or all applicable candidate VFs otherwise. If vectorization and
/// interleaving should be avoided up-front, no plans are generated.
/// DiffChecks is a list of pointer pairs that should be checked for aliasing,
- /// setting HasAliasMask to true in the case that an alias mask is generated
- /// and the vector loop should be entered even if the pointers alias across a
- /// loop iteration.
+  /// with the resulting predicate combined with the active lane mask if one
+  /// is in use.
void plan(ElementCount UserVF, unsigned UserIC,
- std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
- bool &HasAliasMask);
+ std::optional<ArrayRef<PointerDiffInfo>> DiffChecks);
/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
@@ -585,22 +583,20 @@ class LoopVectorizationPlanner {
/// built. Each VPlan is built starting from a copy of \p InitialPlan, which
/// is a plain CFG VPlan wrapping the original scalar loop.
/// RTChecks is a list of pointer pairs that should be checked for aliasing,
- /// setting HasAliasMask to true in the case that an alias mask is generated
- /// and the vector loop should be entered even if the pointers alias across a
- /// loop iteration.
+  /// with the resulting predicate combined with the active lane mask if one
+  /// is in use.
VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range,
LoopVersioning *LVer,
- ArrayRef<PointerDiffInfo> RTChecks,
- bool &HasAliasMask);
+ ArrayRef<PointerDiffInfo> RTChecks);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop. This method creates VPlans using VPRecipes.
- /// RTChecks contains a list of pointer pairs that an alias mask should be
- /// generated for.
+  /// RTChecks is a list of pointer pairs that should be checked for aliasing,
+  /// with the resulting predicate combined with the active lane mask if one
+  /// is in use.
void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
- ArrayRef<PointerDiffInfo> RTChecks,
- bool &HasAliasMask);
+ ArrayRef<PointerDiffInfo> RTChecks);
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6900dbb217d07..adf4fda660461 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -567,11 +567,7 @@ class InnerLoopVectorizer {
/// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
/// vector preheader and its predecessor, also connecting the new block to the
/// scalar preheader.
- /// If HasAliasMask is true then the vector loop will be branched to
- /// unconditionally, instead of there being a conditional branch to the scalar
- /// loop or vector loop
- void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB,
- bool HasAliasMask = false);
+ void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
/// The original loop.
Loop *OrigLoop;
@@ -1355,7 +1351,7 @@ class LoopVectorizationCostModel {
case TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck:
return RTCheckStyle::UseSafeEltsMask;
default:
- return RTCheckStyle::ScalarFallback;
+ return RTCheckStyle::ScalarDifference;
}
}
@@ -1818,10 +1814,6 @@ class GeneratedRTChecks {
TTI::TargetCostKind CostKind;
public:
- /// Set by VPlan when the vector loop should be entered even when runtime
- /// checks determine that pointers alias within an iteration.
- bool HasAliasMask = false;
-
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
const DataLayout &DL, TTI::TargetCostKind CostKind)
@@ -1835,7 +1827,8 @@ class GeneratedRTChecks {
/// there is no vector code generation, the check blocks are removed
/// completely.
void create(Loop *L, const LoopAccessInfo &LAI,
- const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
+ const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
+ bool UseSafeEltsMask) {
// Hard cutoff to limit compile-time increase in case a very large number of
// runtime checks needs to be generated.
@@ -1883,7 +1876,7 @@ class GeneratedRTChecks {
RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
return RuntimeVF;
},
- IC);
+ IC, VF, UseSafeEltsMask);
} else {
MemRuntimeCheckCond = addRuntimeChecks(
MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
@@ -2296,7 +2289,57 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}
-void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB, bool HasAliasMask) {
+Value *
+InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
+ if (VectorTripCount)
+ return VectorTripCount;
+
+ Value *TC = getTripCount();
+ IRBuilder<> Builder(InsertBlock->getTerminator());
+
+ Type *Ty = TC->getType();
+ // This is where we can make the step a runtime constant.
+ Value *Step = createStepForVF(Builder, Ty, VF, UF);
+
+ // If the tail is to be folded by masking, round the number of iterations N
+ // up to a multiple of Step instead of rounding down. This is done by first
+ // adding Step-1 and then rounding down. Note that it's ok if this addition
+ // overflows: the vector induction variable will eventually wrap to zero given
+ // that it starts at zero and its Step is a power of two; the loop will then
+ // exit, with the last early-exit vector comparison also producing all-true.
+ // For scalable vectors the VF is not guaranteed to be a power of 2, but this
+ // is accounted for in emitIterationCountCheck that adds an overflow check.
+ if (Cost->foldTailByMasking()) {
+ assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
+ "VF*UF must be a power of 2 when folding tail by masking");
+ TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
+ "n.rnd.up");
+ }
+
+ // Now we need to generate the expression for the part of the loop that the
+ // vectorized body will execute. This is equal to N - (N % Step) if scalar
+ // iterations are not required for correctness, or N - Step, otherwise. Step
+ // is equal to the vectorization factor (number of SIMD elements) times the
+ // unroll factor (number of SIMD instructions).
+ Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+
+ // There are cases where we *must* run at least one iteration in the remainder
+ // loop. See the cost model for when this can happen. If the step evenly
+ // divides the trip count, we set the remainder to be equal to the step. If
+ // the step does not evenly divide the trip count, no adjustment is necessary
+ // since there will already be scalar iterations. Note that the minimum
+ // iterations check ensures that N >= Step.
+ if (Cost->requiresScalarEpilogue(VF.isVector())) {
+ auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
+ R = Builder.CreateSelect(IsZero, Step, R);
+ }
+
+ VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
+
+ return VectorTripCount;
+}
+
+void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
// Note: The block with the minimum trip-count check is already connected
// during earlier VPlan construction.
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
@@ -2306,19 +2349,17 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB, bool
VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPBB, CheckVPIRBB);
PreVectorPH = CheckVPIRBB;
- if (!HasAliasMask) {
- VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
- PreVectorPH->swapSuccessors();
-
- // We just connected a new block to the scalar preheader. Update all
- // VPPhis by adding an incoming value for it, replicating the last value.
- unsigned NumPredecessors = ScalarPH->getNumPredecessors();
- for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
- assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
- assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
- "must have incoming values for all operands");
- R.addOperand(R.getOperand(NumPredecessors - 2));
- }
+ VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
+ PreVectorPH->swapSuccessors();
+
+ // We just connected a new block to the scalar preheader. Update all
+ // VPPhis by adding an incoming value for it, replicating the last value.
+ unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+ for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+ assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+ assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+ "must have incoming values for all operands");
+ R.addOperand(R.getOperand(NumPredecessors - 2));
}
}
@@ -6725,7 +6766,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
void LoopVectorizationPlanner::plan(
ElementCount UserVF, unsigned UserIC,
- std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
+ std::optional<ArrayRef<PointerDiffInfo>> RTChecks) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
@@ -6772,7 +6813,7 @@ void LoopVectorizationPlanner::plan(
CM.collectInLoopReductions();
if (CM.selectUserVectorizationFactor(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks, HasAliasMask);
+ buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks);
LLVM_DEBUG(printPlans(dbgs()));
return;
}
@@ -6797,9 +6838,9 @@ void LoopVectorizationPlanner::plan(
}
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
- DiffChecks, HasAliasMask);
+ DiffChecks);
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
- DiffChecks, HasAliasMask);
+ DiffChecks);
LLVM_DEBUG(printPlans(dbgs()));
}
@@ -8406,9 +8447,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
ScaleFactor, Reduction);
}
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
- ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
- bool &HasAliasMask) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
+ ElementCount MinVF, ElementCount MaxVF,
+ ArrayRef<PointerDiffInfo> DiffChecks) {
if (ElementCount::isKnownGT(MinVF, MaxVF))
return;
@@ -8434,7 +8475,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
VFRange SubRange = {VF, MaxVFTimes2};
if (auto Plan = tryToBuildVPlanWithVPRecipes(
- std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer, RTChecks, HasAliasMask)) {
+ std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer,
+ DiffChecks)) {
bool HasScalarVF = Plan->hasScalarVFOnly();
// Now optimize the initial VPlan.
if (!HasScalarVF)
@@ -8659,7 +8701,8 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
}
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
- VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask) {
+ VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer,
+ ArrayRef<PointerDiffInfo> DiffChecks) {
using namespace llvm::VPlanPatternMatch;
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8948,9 +8991,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
bool WithoutRuntimeCheck =
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
- WithoutRuntimeCheck, PSE, RTChecks);
- if (ForControlFlow && !RTChecks.empty())
- HasAliasMask = true;
+ WithoutRuntimeCheck, PSE, DiffChecks);
}
VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
@@ -9366,7 +9407,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
"Cannot SCEV check stride or overflow when optimizing for size");
VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
- HasBranchWeights, RTChecks.HasAliasMask);
+ HasBranchWeights);
}
const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
if (MemCheckBlock) {
@@ -9391,7 +9432,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
});
}
VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
- HasBranchWeights, RTChecks.HasAliasMask);
+ HasBranchWeights);
}
}
@@ -9522,7 +9563,6 @@ static bool processLoopInVPlanNativePath(
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
-
return true;
}
@@ -10137,30 +10177,31 @@ bool LoopVectorizePass::processLoop(Loop *L) {
ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- CM.CostKind);
-
// Plan how to best vectorize.
LVP.plan(UserVF, UserIC,
- LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(),
- Checks.HasAliasMask);
+ LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks());
VectorizationFactor VF = LVP.computeBestVF();
- if (Checks.HasAliasMask)
- LoopsAliasMasked++;
unsigned IC = 1;
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
unsigned SelectedIC = std::max(IC, UserIC);
- // Optimistically generate runtime checks if they are needed. Drop them if
+ // Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
if (VF.Width.isVector() || SelectedIC > 1) {
- Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+ TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
+ bool UseSafeEltsMask =
+ useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle));
+ if (UseSafeEltsMask)
+ LoopsAliasMasked++;
+ Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
+ UseSafeEltsMask);
// Bail out early if either the SCEV or memory runtime checks are known to
// fail. In that case, the vector loop would never execute.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fbf12c9483641..c96f677d3b127 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3238,7 +3238,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
}
};
-// Given a pointer A that is being laoded from, and pointer B that is being
+// Given a pointer A that is being loaded from, and pointer B that is being
// stored to, both with unknown lengths, create a mask that disables
// elements which could overlap across a loop iteration. For example, if A
// is X and B is X + 2 with VF being 4, only the first two elements of the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 4d8080df02930..b231a8429503f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -638,26 +638,24 @@ static constexpr uint32_t CheckBypassWeights[] = {1, 127};
void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
BasicBlock *CheckBlock,
- bool AddBranchWeights, bool HasAliasMask) {
+ bool AddBranchWeights) {
+ VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
VPBlockBase *VectorPH = Plan.getVectorPreheader();
+ VPBlockBase *ScalarPH = Plan.getScalarPreheader();
VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlockVPBB);
- VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
- if (!HasAliasMask) {
- VPBlockBase *ScalarPH = Plan.getScalarPreheader();
- VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
- CheckBlockVPBB->swapSuccessors();
-
- // We just connected a new block to the scalar preheader. Update all
- // VPPhis by adding an incoming value for it, replicating the last value.
- unsigned NumPredecessors = ScalarPH->getNumPredecessors();
- for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
- assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
- assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
- "must have incoming values for all operands");
- R.addOperand(R.getOperand(NumPredecessors - 2));
- }
+ VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
+ CheckBlockVPBB->swapSuccessors();
+
+ // We just connected a new block to the scalar preheader. Update all
+ // VPPhis by adding an incoming value for it, replicating the last value.
+ unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+ for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+ assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+ assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+ "must have incoming values for all operands");
+ R.addOperand(R.getOperand(NumPredecessors - 2));
}
VPIRMetadata VPBranchWeights;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 4be54a8eaf15b..e3d30b0ded162 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2053,11 +2053,15 @@ static VPValue *addVPLaneMaskPhiAndUpdateExitBranch(
LaneMaskPhi->insertAfter(CanonicalIVPHI);
VPValue *LaneMask = LaneMaskPhi;
if (AliasMask) {
+ auto &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
+ VPBuilder CountBuilder =
+ VPBuilder::getToInsertAfter(AliasMask->getDefiningRecipe());
+ VPValue *PopCount = CountBuilder.createNaryOp(VPInstruction::PopCount,
+ {AliasMask}, DL, "popcount");
// Increment phi by correct amount.
Builder.setInsertPoint(CanonicalIVIncrement);
- VPValue *IncrementBy = Builder.createNaryOp(VPInstruction::PopCount,
- {AliasMask}, DL, "popcount");
+ VPValue *IncrementBy = PopCount;
Type *IVType = CanonicalIVPHI->getScalarType();
if (IVType->getScalarSizeInBits() < 64)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index a23ab1c09d253..83df2144822d5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -89,9 +89,9 @@ struct VPlanTransforms {
/// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
/// VPValue and connect the block to \p Plan, using the VPValue as branch
- /// condition. If an alias mask has been set up then the checkblock won't branch to the scalar preheader.
+ /// condition.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
- bool AddBranchWeights, bool HasAliasMask);
+ bool AddBranchWeights);
/// Replaces the VPInstructions in \p Plan with corresponding
/// widen recipes. Returns false if any VPInstructions could not be converted
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index 4f1696b7d7e1f..acd6beb303954 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -1,47 +1,25 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --filter-out-after "^scalar.ph:" --version 4
-; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize,instcombine,early-cse -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
; CHECK-LABEL: define dso_local void @alias_mask(
; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[C2:%.*]] = ptrtoint ptr [[C]] to i64
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[C2]], [[B3]]
-; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[B]], ptr [[C]], i64 1)
+; CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
+; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[DOTNOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP7]], 16
-; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP6]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP5]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[B2]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[C1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT6]], i32 0
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-; CHECK-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP14]] to ptr
-; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP30]], ptr [[TMP31]], i64 1)
+; CHECK-NEXT: [[TMP24:%.*]] = zext i8 [[TMP1]] to i64
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP8]], 16
-; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[TMP15]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt i64 [[N]], [[TMP15]]
-; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP8]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
@@ -49,23 +27,16 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP25:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[TMP25]])
-; CHECK-NEXT: [[TMP28:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
-; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP28]])
-; CHECK-NEXT: [[TMP24:%.*]] = zext i8 [[TMP23]] to i64
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP21]], i32 1, <vscale x 16 x i1> [[TMP25]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
-; CHECK-NEXT: [[TMP26:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i32 0
-; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
;
entry:
@@ -93,44 +64,21 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
; CHECK-LABEL: define i32 @alias_mask_read_after_write(
; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[C2:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[C3:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[B2]], [[C3]]
-; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[C]], ptr [[B]], i64 4)
+; CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
+; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[DOTNOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP8]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP7]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[C2]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[B1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT6]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP28]] to ptr
-; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP15]] to ptr
-; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[TMP34]], ptr [[TMP35]], i64 4)
+; CHECK-NEXT: [[TMP27:%.*]] = zext i8 [[TMP1]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP9]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP16]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
-; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP9]], 2
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
@@ -139,25 +87,18 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP31:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP17]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP20]], i32 2, <vscale x 4 x i1> [[TMP31]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP19]], i32 2, <vscale x 4 x i1> [[TMP31]])
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP21]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP23:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP24:%.*]] = add <vscale x 4 x i32> [[TMP23]], [[WIDE_MASKED_LOAD5]]
; CHECK-NEXT: [[TMP25]] = select <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP32:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
-; CHECK-NEXT: [[TMP26:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP32]])
-; CHECK-NEXT: [[TMP27:%.*]] = zext i8 [[TMP26]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
-; CHECK-NEXT: [[TMP29:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <vscale x 4 x i1> [[TMP29]], i32 0
-; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
index d3c7c9c14802c..e4c142c73b8ec 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
@@ -10,32 +10,32 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
; CHECK-NEXT: Live-in vp<%0> = VF
; CHECK-NEXT: vp<%3> = original trip-count
-; CHECK-EMPTY:
+; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
; CHECK-NEXT: IR %wide.trip.count = zext nneg i32 %n to i64
; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64)
; CHECK-NEXT: EMIT vp<%4> = EXPAND SCEV (ptrtoint ptr %c to i64)
; CHECK-NEXT: EMIT vp<%5> = EXPAND SCEV (ptrtoint ptr %b to i64)
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
-; CHECK-EMPTY:
+; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: EMIT vp<%6> = ALIAS-LANE-MASK vp<%5>, vp<%4> (write-after-read)
; CHECK-NEXT: EMIT vp<%index.part.next> = VF * Part + ir<0>
; CHECK-NEXT: EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, vp<%3>
; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
+; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<%7> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ACTIVE-LANE-MASK-PHI vp<%8> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next>
; CHECK-NEXT: EMIT vp<%9> = and vp<%8>, vp<%6>
; CHECK-NEXT: Successor(s): pred.store
-; CHECK-EMPTY:
+; CHECK-EMPTY:
; CHECK-NEXT: <xVFxUF> pred.store: {
; CHECK-NEXT: pred.store.entry:
; CHECK-NEXT: BRANCH-ON-MASK vp<%9>
; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
-; CHECK-EMPTY:
+; CHECK-EMPTY:
; CHECK-NEXT: pred.store.if:
; CHECK-NEXT: vp<%10> = SCALAR-STEPS vp<%7>, ir<1>, vp<%0>
; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%10>
@@ -46,12 +46,12 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: REPLICATE ir<%add> = add ir<%1>, ir<%0>
; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx6>
; CHECK-NEXT: Successor(s): pred.store.continue
-; CHECK-EMPTY:
+; CHECK-EMPTY:
; CHECK-NEXT: pred.store.continue:
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): for.body.2
-; CHECK-EMPTY:
+; CHECK-EMPTY:
; CHECK-NEXT: for.body.2:
; CHECK-NEXT: EMIT vp<%popcount> = popcount vp<%6>
; CHECK-NEXT: EMIT vp<%index.next> = add vp<%7>, vp<%popcount>
@@ -62,10 +62,10 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): middle.block
-; CHECK-EMPTY:
+; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>
-; CHECK-EMPTY:
+; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
; CHECK-NEXT: No successors
; CHECK-EMPTY:
>From 7d5ce374830189f53dc8b0b016552dbbfcb44bd5 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 6 Aug 2025 17:04:52 +0100
Subject: [PATCH 03/15] Add target hook
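A brief sketch of how a target opts in to the new hook follows; the target
class and the subtarget query below are hypothetical, whereas the AArch64
override added in this patch simply returns ST->hasSVE2().

// Hypothetical out-of-tree target; the class name and the subtarget feature
// query are illustrative, not part of this patch.
bool MyTargetTTIImpl::useSafeEltsMask(ElementCount VF) const {
  // Report true only when the target can materialise the
  // llvm.loop.dependence.{war,raw}.mask intrinsics cheaply; otherwise the
  // default implementation (false) keeps the scalar-difference runtime
  // checks.
  return ST->hasCheapLoopDependenceMask();
}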
---
.../llvm/Analysis/TargetTransformInfo.h | 5 ++
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++
.../AArch64/AArch64TargetTransformInfo.cpp | 5 ++
.../AArch64/AArch64TargetTransformInfo.h | 2 +
.../Transforms/Vectorize/LoopVectorize.cpp | 60 ++-----------------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 3 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 +-
.../AArch64/induction-costs-sve.ll | 26 ++------
.../vplan-printing-predicated.ll | 52 +++++++---------
10 files changed, 53 insertions(+), 111 deletions(-)
rename llvm/test/Transforms/LoopVectorize/{ => AArch64}/vplan-printing-predicated.ll (68%)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f96f2f411b6b6..df18f223b58e5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1354,6 +1354,11 @@ class TargetTransformInfo {
PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const;
+ /// \return true if a mask should be formed that disables lanes that could
+ /// alias between two pointers. The mask is created by the
+ /// loop_dependence_{war,raw}_mask intrinsics.
+ LLVM_ABI bool useSafeEltsMask(ElementCount VF) const;
+
/// \return The maximum interleave factor that any transform should try to
/// perform for this target. This number depends on the level of parallelism
/// and the number of execution units in the CPU.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 200cbafbaa6e2..e2ccf477fad4b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -660,6 +660,8 @@ class TargetTransformInfoImplBase {
return InstructionCost::getInvalid();
}
+ virtual bool useSafeEltsMask(ElementCount VF) const { return false; }
+
virtual unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
virtual InstructionCost getArithmeticInstrCost(
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3141060a710ce..e7a47d608d30a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -878,6 +878,10 @@ InstructionCost TargetTransformInfo::getPartialReductionCost(
BinOp, CostKind);
}
+bool TargetTransformInfo::useSafeEltsMask(ElementCount VF) const {
+ return TTIImpl->useSafeEltsMask(VF);
+}
+
unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e42484353259f..551c2135be15f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5598,6 +5598,11 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
return Cost;
}
+bool AArch64TTIImpl::useSafeEltsMask(ElementCount VF) const {
+ // The whilewr/rw instructions require SVE2
+ return ST->hasSVE2();
+}
+
InstructionCost
AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
VectorType *SrcTy, ArrayRef<int> Mask,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 9c96fdd427814..0ba739a86b77c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -388,6 +388,8 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const override;
+ bool useSafeEltsMask(ElementCount VF) const override;
+
bool enableOrderedReductions() const override { return true; }
InstructionCost getInterleavedMemoryOpCost(
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index adf4fda660461..649fa3e3f8e24 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2076,8 +2076,10 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
-static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style) {
- return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask;
+static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style,
+ ElementCount VF, const TargetTransformInfo &TTI) {
+ return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask &&
+ TTI.useSafeEltsMask(VF);
}
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
@@ -2289,56 +2291,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}
-Value *
-InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
- if (VectorTripCount)
- return VectorTripCount;
-
- Value *TC = getTripCount();
- IRBuilder<> Builder(InsertBlock->getTerminator());
-
- Type *Ty = TC->getType();
- // This is where we can make the step a runtime constant.
- Value *Step = createStepForVF(Builder, Ty, VF, UF);
-
- // If the tail is to be folded by masking, round the number of iterations N
- // up to a multiple of Step instead of rounding down. This is done by first
- // adding Step-1 and then rounding down. Note that it's ok if this addition
- // overflows: the vector induction variable will eventually wrap to zero given
- // that it starts at zero and its Step is a power of two; the loop will then
- // exit, with the last early-exit vector comparison also producing all-true.
- // For scalable vectors the VF is not guaranteed to be a power of 2, but this
- // is accounted for in emitIterationCountCheck that adds an overflow check.
- if (Cost->foldTailByMasking()) {
- assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
- "VF*UF must be a power of 2 when folding tail by masking");
- TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
- "n.rnd.up");
- }
-
- // Now we need to generate the expression for the part of the loop that the
- // vectorized body will execute. This is equal to N - (N % Step) if scalar
- // iterations are not required for correctness, or N - Step, otherwise. Step
- // is equal to the vectorization factor (number of SIMD elements) times the
- // unroll factor (number of SIMD instructions).
- Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
-
- // There are cases where we *must* run at least one iteration in the remainder
- // loop. See the cost model for when this can happen. If the step evenly
- // divides the trip count, we set the remainder to be equal to the step. If
- // the step does not evenly divide the trip count, no adjustment is necessary
- // since there will already be scalar iterations. Note that the minimum
- // iterations check ensures that N >= Step.
- if (Cost->requiresScalarEpilogue(VF.isVector())) {
- auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
- R = Builder.CreateSelect(IsZero, Step, R);
- }
-
- VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
-
- return VectorTripCount;
-}
-
void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
// Note: The block with the minimum trip-count check is already connected
// during earlier VPlan construction.
@@ -6778,7 +6730,7 @@ void LoopVectorizationPlanner::plan(
ArrayRef<PointerDiffInfo> DiffChecks;
auto TFStyle = CM.getTailFoldingStyle();
if (RTChecks.has_value() &&
- useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle)))
+ useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), UserVF, TTI))
DiffChecks = *RTChecks;
// Invalidate interleave groups if all blocks of loop will be predicated.
@@ -10197,7 +10149,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (VF.Width.isVector() || SelectedIC > 1) {
TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
bool UseSafeEltsMask =
- useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle));
+ useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), VF.Width, *TTI);
if (UseSafeEltsMask)
LoopsAliasMasked++;
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 0861a34ac3e42..1ad2265a6268f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -310,7 +310,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPExpressionRecipe>([this](const auto *R) {
return inferScalarType(R->getOperandOfResultType());
- }).Case<VPAliasLaneMaskRecipe>([this](const VPAliasLaneMaskRecipe *R) {
+ })
+ .Case<VPAliasLaneMaskRecipe>([this](const VPAliasLaneMaskRecipe *R) {
return Type::getInt1Ty(Ctx);
});
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a2d758bbd7d7c..3c4cc59b2d4a0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3780,9 +3780,8 @@ void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
Value *SinkValue = State.get(getSinkValue(), true);
Value *SourceValue = State.get(getSourceValue(), true);
- unsigned IntrinsicID = WriteAfterRead
- ? Intrinsic::loop_dependence_war_mask
- : Intrinsic::loop_dependence_raw_mask;
+ unsigned IntrinsicID = WriteAfterRead ? Intrinsic::loop_dependence_war_mask
+ : Intrinsic::loop_dependence_raw_mask;
Value *SourceAsPtr = Builder.CreateCast(Instruction::IntToPtr, SourceValue,
Builder.getPtrTy());
Value *SinkAsPtr =
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index a034661432790..bf0c0ee0656a8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -90,8 +90,6 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-LABEL: define void @iv_casts(
; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; PRED-NEXT: [[ENTRY:.*]]:
-; PRED-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
-; PRED-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; PRED-NEXT: [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
; PRED-NEXT: [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
@@ -101,7 +99,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; PRED-NEXT: br label %[[VECTOR_PH:.*]]
+; PRED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
@@ -111,17 +109,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
-; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[SRC2]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; PRED-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[DST1]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PRED-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT6]], i32 0
-; PRED-NEXT: [[TMP33:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT1]], i32 0
-; PRED-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-; PRED-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP25]] to ptr
-; PRED-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP34]], ptr [[TMP35]], i64 1)
; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 4
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -132,8 +121,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
; PRED: [[VECTOR_BODY]]:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], [[ALIAS_LANE_MASK]]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; PRED-NEXT: [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
@@ -143,12 +131,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[TMP30:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
-; PRED-NEXT: [[TMP31:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP30]])
-; PRED-NEXT: [[TMP32:%.*]] = zext i8 [[TMP31]] to i64
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP32]]
+; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
@@ -156,7 +140,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
; PRED: [[SCALAR_PH]]:
-; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; PRED-NEXT: br label %[[LOOP:.*]]
; PRED: [[LOOP]]:
; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-predicated.ll
similarity index 68%
rename from llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
rename to llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-predicated.ll
index e4c142c73b8ec..8887ef25437a4 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-predicated.ll
@@ -1,7 +1,8 @@
; REQUIRES: asserts
-; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data-and-control -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -mattr=+sve2 -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data-and-control -disable-output %s 2>&1 | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
; Tests for printing predicated VPlans.
@@ -20,6 +21,7 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: EMIT vp<%6> = ALIAS-LANE-MASK vp<%5>, vp<%4> (write-after-read)
+; CHECK-NEXT: EMIT vp<%popcount> = popcount vp<%6>
; CHECK-NEXT: EMIT vp<%index.part.next> = VF * Part + ir<0>
; CHECK-NEXT: EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, vp<%3>
; CHECK-NEXT: Successor(s): vector loop
@@ -28,37 +30,23 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<%7> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ACTIVE-LANE-MASK-PHI vp<%8> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next>
-; CHECK-NEXT: EMIT vp<%9> = and vp<%8>, vp<%6>
-; CHECK-NEXT: Successor(s): pred.store
-; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.store: {
-; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<%9>
-; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: vp<%10> = SCALAR-STEPS vp<%7>, ir<1>, vp<%0>
-; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%10>
-; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
-; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%10>
-; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
-; CHECK-NEXT: REPLICATE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<%10>
-; CHECK-NEXT: REPLICATE ir<%add> = add ir<%1>, ir<%0>
-; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx6>
-; CHECK-NEXT: Successor(s): pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.continue:
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): for.body.2
-; CHECK-EMPTY:
-; CHECK-NEXT: for.body.2:
-; CHECK-NEXT: EMIT vp<%popcount> = popcount vp<%6>
+; CHECK-NEXT: vp<%9> = SCALAR-STEPS vp<%7>, ir<1>, vp<%0>
+; CHECK-NEXT: EMIT vp<%10> = and vp<%8>, vp<%6>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%9>
+; CHECK-NEXT: vp<%11> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%0> = load vp<%11>, vp<%10>
+; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%9>
+; CHECK-NEXT: vp<%12> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN ir<%1> = load vp<%12>, vp<%10>
+; CHECK-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0>
+; CHECK-NEXT: CLONE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<%9>
+; CHECK-NEXT: vp<%13> = vector-pointer ir<%arrayidx6>
+; CHECK-NEXT: WIDEN store vp<%13>, ir<%add>, vp<%10>
; CHECK-NEXT: EMIT vp<%index.next> = add vp<%7>, vp<%popcount>
-; CHECK-NEXT: EMIT vp<%11> = VF * Part + vp<%index.next>
-; CHECK-NEXT: EMIT vp<%active.lane.mask.next> = active lane mask vp<%11>, vp<%3>
-; CHECK-NEXT: EMIT vp<%12> = not vp<%active.lane.mask.next>
-; CHECK-NEXT: EMIT branch-on-cond vp<%12>
+; CHECK-NEXT: EMIT vp<%14> = VF * Part + vp<%index.next>
+; CHECK-NEXT: EMIT vp<%active.lane.mask.next> = active lane mask vp<%14>, vp<%3>
+; CHECK-NEXT: EMIT vp<%15> = not vp<%active.lane.mask.next>
+; CHECK-NEXT: EMIT branch-on-cond vp<%15>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): middle.block
>From ab05f4650bee2557da48c0af198f826cd2543f63 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 13 Aug 2025 16:23:51 +0100
Subject: [PATCH 04/15] Add computeCost
---
llvm/lib/Transforms/Vectorize/VPlan.h | 3 ++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +++++
.../AArch64/induction-costs-sve.ll | 37 +++++--------------
3 files changed, 22 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c96f677d3b127..a4d5cfc164a84 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3277,6 +3277,9 @@ class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
bool isWriteAfterRead() const { return WriteAfterRead; }
+InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
private:
unsigned ElementSize;
bool WriteAfterRead;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3c4cc59b2d4a0..bcc8b19913225 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3793,6 +3793,15 @@ void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
State.set(this, AliasMask, /*IsScalar=*/false);
}
+InstructionCost
+VPAliasLaneMaskRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const {
+ Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
+ Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes Attrs(isWriteAfterRead() ? Intrinsic::loop_dependence_war_mask : Intrinsic::loop_dependence_raw_mask, RetTy,
+ {ArgTy, ArgTy});
+ return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index bf0c0ee0656a8..6826cbfa24594 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -28,8 +28,6 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; DEFAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP13:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
@@ -60,7 +58,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP38]], i64 [[TMP42]]
; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP36]], ptr [[TMP38]], align 1
; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP37]], ptr [[TMP43]], align 1
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
; DEFAULT-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
@@ -101,12 +99,6 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; PRED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
-; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
-; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP5]], 1
-; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
-; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
-; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
@@ -134,9 +126,9 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
-; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; PRED-NEXT: [[TMP27:%.*]] = xor i1 [[TMP25]], true
+; PRED-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
; PRED: [[SCALAR_PH]]:
@@ -270,9 +262,6 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
-; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
-; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
-; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2
; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2
; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
@@ -304,9 +293,9 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; PRED: [[PRED_STORE_CONTINUE2]]:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; PRED-NEXT: [[TMP25:%.*]] = xor i1 [[TMP24]], true
; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
-; PRED-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0
; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
@@ -441,9 +430,6 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
; PRED-NEXT: br i1 [[TMP13]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
-; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4
; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
@@ -496,9 +482,9 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; PRED: [[PRED_STORE_CONTINUE7]]:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
-; PRED-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; PRED-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0
; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
@@ -635,9 +621,6 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
-; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4
; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4
; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
@@ -690,9 +673,9 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; PRED: [[PRED_STORE_CONTINUE6]]:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; PRED-NEXT: [[TMP35:%.*]] = xor i1 [[TMP34]], true
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0
; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
>From 569f6298eb6aca1cab4d858713ec0719e663ee61 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 15:28:52 +0100
Subject: [PATCH 05/15] Remove VF from useSafeEltsMask
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 2 +-
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 +-
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++--
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +-
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2 +-
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++----
6 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index df18f223b58e5..09f8dbf2fb917 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1357,7 +1357,7 @@ class TargetTransformInfo {
/// \return true if a mask should be formed that disables lanes that could
/// alias between two pointers. The mask is created by the
/// loop_dependence_{war,raw}_mask intrinsics.
- LLVM_ABI bool useSafeEltsMask(ElementCount VF) const;
+ LLVM_ABI bool useSafeEltsMask() const;
/// \return The maximum interleave factor that any transform should try to
/// perform for this target. This number depends on the level of parallelism
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index e2ccf477fad4b..65c764a396133 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -660,7 +660,7 @@ class TargetTransformInfoImplBase {
return InstructionCost::getInvalid();
}
- virtual bool useSafeEltsMask(ElementCount VF) const { return false; }
+ virtual bool useSafeEltsMask() const { return false; }
virtual unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index e7a47d608d30a..fd5327480b5d0 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -878,8 +878,8 @@ InstructionCost TargetTransformInfo::getPartialReductionCost(
BinOp, CostKind);
}
-bool TargetTransformInfo::useSafeEltsMask(ElementCount VF) const {
- return TTIImpl->useSafeEltsMask(VF);
+bool TargetTransformInfo::useSafeEltsMask() const {
+ return TTIImpl->useSafeEltsMask();
}
unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 551c2135be15f..e8c363087d686 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5598,7 +5598,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
return Cost;
}
-bool AArch64TTIImpl::useSafeEltsMask(ElementCount VF) const {
+bool AArch64TTIImpl::useSafeEltsMask() const {
// The whilewr/rw instructions require SVE2
return ST->hasSVE2();
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 0ba739a86b77c..f3a7c39fabe20 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -388,7 +388,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const override;
- bool useSafeEltsMask(ElementCount VF) const override;
+ bool useSafeEltsMask() const override;
bool enableOrderedReductions() const override { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 649fa3e3f8e24..7cb4320661f5c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2077,9 +2077,9 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
}
static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style,
- ElementCount VF, const TargetTransformInfo &TTI) {
+ const TargetTransformInfo &TTI) {
return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask &&
- TTI.useSafeEltsMask(VF);
+ TTI.useSafeEltsMask();
}
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
@@ -6730,7 +6730,7 @@ void LoopVectorizationPlanner::plan(
ArrayRef<PointerDiffInfo> DiffChecks;
auto TFStyle = CM.getTailFoldingStyle();
if (RTChecks.has_value() &&
- useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), UserVF, TTI))
+ useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), TTI))
DiffChecks = *RTChecks;
// Invalidate interleave groups if all blocks of loop will be predicated.
@@ -10149,7 +10149,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (VF.Width.isVector() || SelectedIC > 1) {
TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
bool UseSafeEltsMask =
- useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), VF.Width, *TTI);
+ useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), *TTI);
if (UseSafeEltsMask)
LoopsAliasMasked++;
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
>From 10aa549545b8e73de0a32f1ce86fa3620e11c5ba Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 15:29:22 +0100
Subject: [PATCH 06/15] Calculate expanded cost and allow the intrinsic for SME
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 47 +++++++++++++++++++
.../AArch64/AArch64TargetTransformInfo.cpp | 4 +-
2 files changed, 49 insertions(+), 2 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index aa9d1f0a1ccea..ed1b9dca5a729 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2141,6 +2141,53 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// Otherwise, fallback to default scalarization cost.
break;
}
+ case Intrinsic::loop_dependence_raw_mask:
+ case Intrinsic::loop_dependence_war_mask: {
+ InstructionCost Cost = 0;
+ Type *PtrTy = ICA.getArgTypes()[0];
+ bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;
+
+ Cost +=
+ thisT()->getArithmeticInstrCost(Instruction::Sub, PtrTy, CostKind);
+ if (IsReadAfterWrite) {
+ IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, PtrTy, {PtrTy}, {});
+ Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind);
+ }
+
+ Cost +=
+ thisT()->getArithmeticInstrCost(Instruction::SDiv, PtrTy, CostKind);
+ Type *CmpTy =
+ getTLI()
+ ->getSetCCResultType(
+ thisT()->getDataLayout(), RetTy->getContext(),
+ getTLI()->getValueType(thisT()->getDataLayout(), PtrTy))
+ .getTypeForEVT(RetTy->getContext());
+ Cost += thisT()->getCmpSelInstrCost(
+ BinaryOperator::ICmp, CmpTy, PtrTy,
+ IsReadAfterWrite ? CmpInst::ICMP_EQ : CmpInst::ICMP_SLE, CostKind);
+
+ // The deconstructed active lane mask
+ VectorType *RetTyVec = cast<VectorType>(RetTy);
+ VectorType *SplatTy = cast<VectorType>(RetTyVec->getWithNewType(PtrTy));
+ Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SplatTy, SplatTy, {},
+ CostKind, 0, nullptr);
+ IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, SplatTy, {},
+ FMF);
+ Cost += thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SplatTy,
+ SplatTy, CmpInst::ICMP_ULT, CostKind);
+
+ Cost +=
+ thisT()->getCastInstrCost(Instruction::CastOps::ZExt, RetTy, SplatTy,
+ TTI::CastContextHint::None, CostKind);
+ Cost += thisT()->getCastInstrCost(Instruction::CastOps::ZExt,
+ RetTyVec->getElementType(), CmpTy,
+ TTI::CastContextHint::None, CostKind);
+ Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, RetTyVec, RetTyVec, {},
+ CostKind, 0, nullptr);
+ Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
+ return Cost;
+ }
}
// Assume that we need to scalarize this intrinsic.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e8c363087d686..22ed3e04fdf15 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1001,9 +1001,9 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
case Intrinsic::loop_dependence_raw_mask:
case Intrinsic::loop_dependence_war_mask:
- if (ST->hasSVE2())
+ if (ST->hasSVE2() || ST->hasSME())
return 1;
- return InstructionCost::getInvalid(CostKind);
+ break;
default:
break;
}
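The new BasicTTIImpl case prices the mask as the sum of the operations a generic expansion would need: a pointer difference, an optional abs for the read-after-write form, a division by the element size, a scalar compare, and a splat/stepvector/compare/or sequence that turns the scalar result into a lane mask. A rough scalar sketch of that sequence follows; expandDependenceMask is a made-up name and the lane rule is inferred from the operations being costed, so treat this as a sketch of the expansion's shape rather than a definition of the intrinsics.

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<bool> expandDependenceMask(int64_t Src, int64_t Sink,
                                       int64_t EltSize, unsigned VF,
                                       bool ReadAfterWrite) {
  int64_t Diff = Sink - Src;      // sub
  if (ReadAfterWrite && Diff < 0)
    Diff = -Diff;                 // abs, costed only for the RAW form
  int64_t Elts = Diff / EltSize;  // sdiv
  // Scalar "every lane is safe" condition: EQ 0 for RAW, SLE 0 for WAR.
  bool AllSafe = ReadAfterWrite ? (Elts == 0) : (Elts <= 0);

  std::vector<bool> Mask(VF);
  for (unsigned Lane = 0; Lane < VF; ++Lane) {
    // splat(Elts) vs. stepvector compare, then OR in the scalar condition.
    bool LaneBelowDiff = static_cast<int64_t>(Lane) < Elts;
    Mask[Lane] = LaneBelowDiff || AllSafe;
  }
  return Mask;
}

int main() {
  // Two byte-sized accesses three elements apart: the fourth lane would
  // alias, so only the first three lanes stay enabled.
  auto M = expandDependenceMask(/*Src=*/0x100, /*Sink=*/0x103, /*EltSize=*/1,
                                /*VF=*/4, /*ReadAfterWrite=*/false);
  for (bool Enabled : M)
    std::cout << (Enabled ? 1 : 0) << ' ';
  std::cout << '\n'; // prints: 1 1 1 0
  return 0;
}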
>From 26349a7a513b488b778143105f1c32cb27d8488d Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 15:33:04 +0100
Subject: [PATCH 07/15] Improve WriteAfterRead assignment
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 43fa20b3c81af..eed635212993d 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -482,7 +482,7 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
}
}
- bool WriteAfterRead = isa<LoadInst>(SrcInsts[0]);
+ bool WriteAfterRead = !Src->IsWritePtr && Sink->IsWritePtr;
LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
<< "SrcStart: " << *SrcStartInt << '\n'
>From 7ca41c89a3fdcd0fd969c2b51f363600401ab3a7 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 15:36:23 +0100
Subject: [PATCH 08/15] whilewr/rw accept SME as well
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 22ed3e04fdf15..9932605dd3d17 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1001,6 +1001,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
case Intrinsic::loop_dependence_raw_mask:
case Intrinsic::loop_dependence_war_mask:
+ // The whilewr/rw instructions require SVE2
if (ST->hasSVE2() || ST->hasSME())
return 1;
break;
@@ -5600,7 +5601,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
bool AArch64TTIImpl::useSafeEltsMask() const {
// The whilewr/rw instructions require SVE2
- return ST->hasSVE2();
+ return ST->hasSVE2() || ST->hasSME();
}
InstructionCost
>From 61e5d8e2820454242d06f949b7c57b60308ffa57 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 16:26:15 +0100
Subject: [PATCH 09/15] Add addSafeEltsRuntimeChecks
---
.../include/llvm/Transforms/Utils/LoopUtils.h | 13 +-
llvm/lib/Transforms/Utils/LoopUtils.cpp | 130 ++++++++++--------
.../Transforms/Vectorize/LoopVectorize.cpp | 7 +-
3 files changed, 86 insertions(+), 64 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index e9515c32b0603..a1172ba2557dd 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -569,11 +569,14 @@ addRuntimeChecks(Instruction *Loc, Loop *TheLoop,
const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
SCEVExpander &Expander, bool HoistRuntimeChecks = false);
-LLVM_ABI Value *
-addDiffRuntimeChecks(Instruction *Loc, ArrayRef<PointerDiffInfo> Checks,
- SCEVExpander &Expander,
- function_ref<Value *(IRBuilderBase &, unsigned)> GetVF,
- unsigned IC, ElementCount VF, bool UseSafeEltsMask);
+LLVM_ABI Value *addSafeEltsRuntimeChecks(Instruction *Loc,
+ ArrayRef<PointerDiffInfo> Checks,
+ SCEVExpander &Expander,
+ ElementCount VF);
+
+LLVM_ABI Value *addDiffRuntimeChecks(
+ Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
+ function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC);
/// Struct to hold information about a partially invariant condition.
struct IVConditionInfo {
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 248f6fa7d7f5e..db150d09d26f6 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2020,10 +2020,61 @@ Value *llvm::addRuntimeChecks(
return MemoryRuntimeCheck;
}
+Value *llvm::addSafeEltsRuntimeChecks(Instruction *Loc,
+ ArrayRef<PointerDiffInfo> Checks,
+ SCEVExpander &Expander, ElementCount VF) {
+ IRBuilder ChkBuilder(Loc->getContext(),
+ InstSimplifyFolder(Loc->getDataLayout()));
+ ChkBuilder.SetInsertPoint(Loc);
+ Value *MemoryRuntimeCheck = nullptr;
+
+ // Map to keep track of created compares, The key is the pair of operands for
+ // the compare, to allow detecting and re-using redundant compares.
+ DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
+ Value *AliasLaneMask = nullptr;
+ for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze,
+ WriteAfterRead] : Checks) {
+ Type *Ty = SinkStart->getType();
+ Value *Sink = Expander.expandCodeFor(SinkStart, Ty, Loc);
+ Value *Src = Expander.expandCodeFor(SrcStart, Ty, Loc);
+ if (SeenCompares.lookup({Sink, Src}))
+ continue;
+
+ unsigned IntOpc = WriteAfterRead ? Intrinsic::loop_dependence_war_mask
+ : Intrinsic::loop_dependence_raw_mask;
+ Value *SourceAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Src,
+ ChkBuilder.getPtrTy());
+ Value *SinkAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Sink,
+ ChkBuilder.getPtrTy());
+ Value *M = ChkBuilder.CreateIntrinsic(
+ IntOpc, {VectorType::get(ChkBuilder.getInt1Ty(), VF)},
+ {SourceAsPtr, SinkAsPtr, ChkBuilder.getInt64(AccessSize)}, nullptr,
+ "alias.lane.mask");
+ SeenCompares.insert({{Sink, Src}, M});
+ if (AliasLaneMask)
+ M = ChkBuilder.CreateAnd(AliasLaneMask, M);
+ else
+ AliasLaneMask = M;
+ }
+ assert(AliasLaneMask && "Expected an alias lane mask to have been created.");
+ auto *VecVT = VectorType::get(ChkBuilder.getInt1Ty(), VF);
+ // Extend to an i8 since i1 is too small to add with
+ Value *PopCount = ChkBuilder.CreateCast(
+ Instruction::ZExt, AliasLaneMask,
+ VectorType::get(ChkBuilder.getInt8Ty(), VecVT->getElementCount()));
+
+ PopCount =
+ ChkBuilder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, PopCount);
+ PopCount = ChkBuilder.CreateCast(Instruction::ZExt, PopCount,
+ ChkBuilder.getInt64Ty());
+ MemoryRuntimeCheck = ChkBuilder.CreateICmpUGT(
+ PopCount, ConstantInt::get(ChkBuilder.getInt64Ty(), 0));
+ return MemoryRuntimeCheck;
+}
+
Value *llvm::addDiffRuntimeChecks(
Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
- function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC,
- ElementCount VF, bool UseSafeEltsMask) {
+ function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC) {
LLVMContext &Ctx = Loc->getContext();
IRBuilder ChkBuilder(Ctx, InstSimplifyFolder(Loc->getDataLayout()));
@@ -2035,68 +2086,33 @@ Value *llvm::addDiffRuntimeChecks(
// Map to keep track of created compares, The key is the pair of operands for
// the compare, to allow detecting and re-using redundant compares.
DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
- Value *AliasLaneMask = nullptr;
for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze,
WriteAfterRead] : Checks) {
Type *Ty = SinkStart->getType();
- if (!VF.isScalar() && UseSafeEltsMask) {
- Value *Sink = Expander.expandCodeFor(SinkStart, Ty, Loc);
- Value *Src = Expander.expandCodeFor(SrcStart, Ty, Loc);
- unsigned IntOpc = WriteAfterRead ? Intrinsic::loop_dependence_war_mask
- : Intrinsic::loop_dependence_raw_mask;
- Value *SourceAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Src,
- ChkBuilder.getPtrTy());
- Value *SinkAsPtr = ChkBuilder.CreateCast(Instruction::IntToPtr, Sink,
- ChkBuilder.getPtrTy());
- Value *M = ChkBuilder.CreateIntrinsic(
- IntOpc, {VectorType::get(ChkBuilder.getInt1Ty(), VF)},
- {SourceAsPtr, SinkAsPtr, ChkBuilder.getInt64(AccessSize)}, nullptr,
- "alias.lane.mask");
- if (AliasLaneMask)
- M = ChkBuilder.CreateAnd(AliasLaneMask, M);
- else
- AliasLaneMask = M;
- } else {
- // Compute VF * IC * AccessSize.
- auto *VFTimesICTimesSize =
- ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
- ConstantInt::get(Ty, IC * AccessSize));
- Value *Diff =
- Expander.expandCodeFor(SE.getMinusSCEV(SinkStart, SrcStart), Ty, Loc);
-
- // Check if the same compare has already been created earlier. In that
- // case, there is no need to check it again.
- Value *IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
- if (IsConflict)
- continue;
+ // Compute VF * IC * AccessSize.
+ auto *VFTimesICTimesSize =
+ ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
+ ConstantInt::get(Ty, IC * AccessSize));
+ Value *Diff =
+ Expander.expandCodeFor(SE.getMinusSCEV(SinkStart, SrcStart), Ty, Loc);
+
+ // Check if the same compare has already been created earlier. In that
+ // case, there is no need to check it again.
+ Value *IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
+ if (IsConflict)
+ continue;
+ IsConflict =
+ ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
+ SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
+ if (NeedsFreeze)
+ IsConflict =
+ ChkBuilder.CreateFreeze(IsConflict, IsConflict->getName() + ".fr");
+ if (MemoryRuntimeCheck) {
IsConflict =
- ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
- SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
- if (NeedsFreeze)
- IsConflict =
- ChkBuilder.CreateFreeze(IsConflict, IsConflict->getName() + ".fr");
- if (MemoryRuntimeCheck) {
- IsConflict =
- ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
+ ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
}
MemoryRuntimeCheck = IsConflict;
- }
- }
-
- if (AliasLaneMask) {
- auto *VecVT = VectorType::get(ChkBuilder.getInt1Ty(), VF);
- // Extend to an i8 since i1 is too small to add with
- Value *PopCount = ChkBuilder.CreateCast(
- Instruction::ZExt, AliasLaneMask,
- VectorType::get(ChkBuilder.getInt8Ty(), VecVT->getElementCount()));
-
- PopCount =
- ChkBuilder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, PopCount);
- PopCount = ChkBuilder.CreateCast(Instruction::ZExt, PopCount,
- ChkBuilder.getInt64Ty());
- MemoryRuntimeCheck = ChkBuilder.CreateICmpUGT(
- PopCount, ConstantInt::get(ChkBuilder.getInt64Ty(), 0));
}
return MemoryRuntimeCheck;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7cb4320661f5c..106e5d27bb497 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1867,7 +1867,10 @@ class GeneratedRTChecks {
"vector.memcheck");
auto DiffChecks = RtPtrChecking.getDiffChecks();
- if (DiffChecks) {
+ if (UseSafeEltsMask) {
+ MemRuntimeCheckCond = addSafeEltsRuntimeChecks(
+ MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, VF);
+ } else if (DiffChecks) {
Value *RuntimeVF = nullptr;
MemRuntimeCheckCond = addDiffRuntimeChecks(
MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
@@ -1876,7 +1879,7 @@ class GeneratedRTChecks {
RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
return RuntimeVF;
},
- IC, VF, UseSafeEltsMask);
+ IC);
} else {
MemRuntimeCheckCond = addRuntimeChecks(
MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
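addSafeEltsRuntimeChecks builds one loop_dependence mask per pointer pair, ANDs them together, widens the i1 lanes to i8 so they can be summed with vector.reduce.add, and finally compares the zero-extended sum against zero. The scalar sketch below mirrors that reduction; countSafeLanes and the bool-vector representation are stand-ins, and how each per-pair mask is produced is left to the intrinsic and not modelled here.

#include <cstdint>
#include <iostream>
#include <vector>

// Combine the per-pair masks, then count how many lanes survive. All masks
// are assumed to have the same number of lanes.
uint64_t countSafeLanes(const std::vector<std::vector<bool>> &Masks) {
  if (Masks.empty())
    return 0;
  std::vector<bool> Combined(Masks.front().size(), true);
  for (const auto &M : Masks)          // the chain of CreateAnd calls
    for (size_t I = 0; I < Combined.size(); ++I)
      Combined[I] = Combined[I] && M[I];

  uint64_t PopCount = 0;               // zext to i8 + vector.reduce.add
  for (bool Lane : Combined)
    PopCount += static_cast<uint8_t>(Lane);
  return PopCount;                     // later zero-extended and compared with 0
}

int main() {
  // Two pointer pairs: one disables the last lane, the other the last two.
  std::vector<std::vector<bool>> Masks = {{true, true, true, false},
                                          {true, true, false, false}};
  uint64_t Safe = countSafeLanes(Masks);
  std::cout << "safe lanes: " << Safe << "\n";              // 2
  std::cout << "check condition (ugt 0): " << (Safe > 0) << "\n";
  return 0;
}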
>From d38da8d531d6bed53e14e2dcdb0ce3f629687b0a Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 17:09:47 +0100
Subject: [PATCH 10/15] Move RTChecks check closer to use
---
.../Vectorize/LoopVectorizationPlanner.h | 12 ++-----
.../Transforms/Vectorize/LoopVectorize.cpp | 34 +++++++++----------
2 files changed, 18 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 4911d2b69e8f8..7671b6acc7535 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -582,21 +582,13 @@ class LoopVectorizationPlanner {
/// set the largest included VF to the maximum VF for which no plan could be
/// built. Each VPlan is built starting from a copy of \p InitialPlan, which
/// is a plain CFG VPlan wrapping the original scalar loop.
- /// RTChecks is a list of pointer pairs that should be checked for aliasing,
- /// combining the resulting predicate with an active lane mask if one is in
- /// use.
VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range,
- LoopVersioning *LVer,
- ArrayRef<PointerDiffInfo> RTChecks);
+ LoopVersioning *LVer);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop. This method creates VPlans using VPRecipes.
- /// RTChecks is a list of pointer pairs that should be checked for aliasing,
- /// combining the resulting predicate with an active lane mask if one is in
- /// use.
- void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
- ArrayRef<PointerDiffInfo> RTChecks);
+ void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 106e5d27bb497..0ed7fefba0fad 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2396,6 +2396,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
static_cast<DominatorTree *>(nullptr), LI,
nullptr, "vector.ph");
+
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
@@ -6730,12 +6731,6 @@ void LoopVectorizationPlanner::plan(
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
return;
- ArrayRef<PointerDiffInfo> DiffChecks;
- auto TFStyle = CM.getTailFoldingStyle();
- if (RTChecks.has_value() &&
- useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), TTI))
- DiffChecks = *RTChecks;
-
// Invalidate interleave groups if all blocks of loop will be predicated.
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
!useMaskedInterleavedAccesses(TTI)) {
@@ -6768,7 +6763,7 @@ void LoopVectorizationPlanner::plan(
CM.collectInLoopReductions();
if (CM.selectUserVectorizationFactor(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks);
+ buildVPlansWithVPRecipes(UserVF, UserVF);
LLVM_DEBUG(printPlans(dbgs()));
return;
}
@@ -6792,10 +6787,8 @@ void LoopVectorizationPlanner::plan(
CM.collectNonVectorizedAndSetWideningDecisions(VF);
}
- buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
- DiffChecks);
- buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
- DiffChecks);
+ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
+ buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
LLVM_DEBUG(printPlans(dbgs()));
}
@@ -8402,9 +8395,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
ScaleFactor, Reduction);
}
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
- ElementCount MinVF, ElementCount MaxVF,
- ArrayRef<PointerDiffInfo> DiffChecks) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
+ ElementCount MaxVF) {
if (ElementCount::isKnownGT(MinVF, MaxVF))
return;
@@ -8430,8 +8422,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
VFRange SubRange = {VF, MaxVFTimes2};
if (auto Plan = tryToBuildVPlanWithVPRecipes(
- std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer,
- DiffChecks)) {
+ std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
bool HasScalarVF = Plan->hasScalarVFOnly();
// Now optimize the initial VPlan.
if (!HasScalarVF)
@@ -8656,8 +8647,7 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
}
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
- VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer,
- ArrayRef<PointerDiffInfo> DiffChecks) {
+ VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
using namespace llvm::VPlanPatternMatch;
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8945,6 +8935,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
bool WithoutRuntimeCheck =
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+
+ ArrayRef<PointerDiffInfo> DiffChecks;
+ std::optional<ArrayRef<PointerDiffInfo>> RTChecks =
+ CM.Legal->getRuntimePointerChecking()->getDiffChecks();
+ if (RTChecks.has_value() &&
+ useSafeEltsMask(Style, CM.getRTCheckStyle(Style), TTI))
+ DiffChecks = *RTChecks;
+
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
WithoutRuntimeCheck, PSE, DiffChecks);
}
>From 6a8e59f25433457415a38f80fd7c83b5b27b40df Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 17:10:35 +0100
Subject: [PATCH 11/15] Don't run extra passes in tests
---
.../LoopVectorize/AArch64/alias_mask.ll | 76 ++++++++++++++-----
1 file changed, 59 insertions(+), 17 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index acd6beb303954..a9726778cceac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -1,10 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --filter-out-after "^scalar.ph:" --version 4
-; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize,instcombine,early-cse -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
; CHECK-LABEL: define dso_local void @alias_mask(
; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B4:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C]] to i64
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
@@ -13,19 +17,35 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[B]], ptr [[C]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
-; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[DOTNOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP24:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[B4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[C3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT6]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[ALIAS_LANE_MASK7:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP12]], ptr [[TMP9]], i64 1)
+; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK7]] to <vscale x 16 x i8>
+; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP10]])
+; CHECK-NEXT: [[TMP24:%.*]] = zext i8 [[TMP11]] to i64
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP8]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
+; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP8]], 16
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[N]], [[TMP14]]
+; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP17]], i64 [[TMP15]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP25:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-NEXT: [[TMP25:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK7]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP25]], <vscale x 16 x i8> poison)
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
@@ -35,8 +55,9 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP21]], i32 1, <vscale x 16 x i1> [[TMP25]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = xor i1 [[TMP23]], true
+; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
;
entry:
@@ -64,6 +85,10 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
; CHECK-LABEL: define i32 @alias_mask_read_after_write(
; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[C4:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[C2:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
@@ -72,20 +97,36 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[C]], ptr [[B]], i64 4)
; CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
-; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[DOTNOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP27:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[C4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[B3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT6]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[ALIAS_LANE_MASK7:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[TMP8]], ptr [[TMP12]], i64 4)
+; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK7]] to <vscale x 4 x i8>
+; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP10]])
+; CHECK-NEXT: [[TMP27:%.*]] = zext i8 [[TMP11]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP9]], 2
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
+; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP18]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[N]], [[TMP18]]
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP31:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-NEXT: [[TMP31:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK7]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP17]], i32 2, <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
@@ -97,8 +138,9 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
; CHECK-NEXT: [[TMP25]] = select <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = xor i1 [[TMP28]], true
+; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
;
entry:
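
For readers following the CHECK lines above: at the scalar level the predicated loop being tested behaves roughly as the sketch below, under the assumption that the alias lane mask enables a leading group of lanes, so its popcount (the vector.reduce.add of the zero-extended mask) can serve as the loop step. The a[i] + b[i] -> c[i] body matches the tests; everything else is illustrative.

  #include <cstdint>

  // Rough scalar model of the vector body in @alias_mask: lanes are processed
  // only when both the active lane mask and the alias lane mask allow it, and
  // the induction variable advances by the number of alias-safe lanes rather
  // than by the full VF.
  void alias_mask_model(const uint8_t *a, const uint8_t *b, uint8_t *c,
                        int64_t n, int64_t vf, int64_t safe_lanes) {
    if (safe_lanes <= 0 || safe_lanes > vf)
      return; // the generated code guards this count with a runtime branch
    for (int64_t index = 0; index < n; index += safe_lanes) {
      for (int64_t lane = 0; lane < vf; ++lane) {
        bool active = index + lane < n;      // active lane mask
        bool alias_safe = lane < safe_lanes; // alias lane mask (leading lanes)
        if (active && alias_safe)            // the AND feeding the masked ops
          c[index + lane] = a[index + lane] + b[index + lane];
      }
    }
  }
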
>From 8a077e9bede4053921fc9561f60df263debc68ca Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Aug 2025 17:18:21 +0100
Subject: [PATCH 12/15] Add tests with multiple aliasing pairs
---
.../LoopVectorize/AArch64/alias_mask.ll | 190 ++++++++++++++++++
1 file changed, 190 insertions(+)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index a9726778cceac..c7344f85f8e02 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -166,3 +166,193 @@ exit: ; preds = %entry, %for.body
%result = phi i32 [ 0, %entry ], [ %add2, %for.body ]
ret i32 %result
}
+
+define dso_local void @alias_mask_multiple(ptr %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define dso_local void @alias_mask_multiple(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B7:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[A6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[C5:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[A]], ptr [[C]], i64 1)
+; CHECK-NEXT: [[ALIAS_LANE_MASK4:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[B]], ptr [[C]], i64 1)
+; CHECK-NEXT: [[TMP0:%.*]] = and <vscale x 16 x i1> [[ALIAS_LANE_MASK]], [[ALIAS_LANE_MASK4]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i8>
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[B7]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[A6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT8]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[C5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT10]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT11]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT9]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[ALIAS_LANE_MASK12:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP9]], ptr [[TMP10]], i64 1)
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT11]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 16 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[ALIAS_LANE_MASK13:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP13]], ptr [[TMP14]], i64 1)
+; CHECK-NEXT: [[TMP15:%.*]] = and <vscale x 16 x i1> [[ALIAS_LANE_MASK12]], [[ALIAS_LANE_MASK13]]
+; CHECK-NEXT: [[TMP16:%.*]] = zext <vscale x 16 x i1> [[TMP15]] to <vscale x 16 x i8>
+; CHECK-NEXT: [[TMP17:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = zext i8 [[TMP17]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 16
+; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[N]], [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ugt i64 [[N]], [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP15]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP25]], i32 1, <vscale x 16 x i1> [[TMP24]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP24]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP27:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP27]], ptr [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP24]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP30:%.*]] = xor i1 [[TMP29]], true
+; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+;
+entry:
+ %cmp11 = icmp sgt i64 %n, 0
+ br i1 %cmp11, label %for.body, label %exit
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %add = add i8 %load.b, %load.a
+ %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+ store i8 %add, ptr %gep.c, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body, %entry
+ ret void
+}
+
+define i32 @alias_mask_multiple_read_after_write(ptr %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define i32 @alias_mask_multiple_read_after_write(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B7:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[A6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[C5:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr [[A]], ptr [[C]], i64 4)
+; CHECK-NEXT: [[ALIAS_LANE_MASK4:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[C]], ptr [[B]], i64 4)
+; CHECK-NEXT: [[TMP0:%.*]] = and <vscale x 4 x i1> [[ALIAS_LANE_MASK]], [[ALIAS_LANE_MASK4]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8>
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[B7]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT8]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[C5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT10]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT11]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT9]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[ALIAS_LANE_MASK12:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr [[TMP9]], ptr [[TMP10]], i64 4)
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 4 x i64> [[BROADCAST_SPLAT11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[ALIAS_LANE_MASK13:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[TMP13]], ptr [[TMP14]], i64 4)
+; CHECK-NEXT: [[TMP15:%.*]] = and <vscale x 4 x i1> [[ALIAS_LANE_MASK12]], [[ALIAS_LANE_MASK13]]
+; CHECK-NEXT: [[TMP16:%.*]] = zext <vscale x 4 x i1> [[TMP15]] to <vscale x 4 x i8>
+; CHECK-NEXT: [[TMP17:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = zext i8 [[TMP17]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
+; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[N]], [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ugt i64 [[N]], [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[TMP15]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP25]], i32 2, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP26]], i32 2, <vscale x 4 x i1> [[TMP24]])
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP27]], i32 2, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP28:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP29:%.*]] = add <vscale x 4 x i32> [[TMP28]], [[WIDE_MASKED_LOAD14]]
+; CHECK-NEXT: [[TMP30]] = select <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> [[TMP29]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP32:%.*]] = xor i1 [[TMP31]], true
+; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: middle.block:
+;
+entry:
+ %cmp19 = icmp sgt i64 %n, 0
+ br i1 %cmp19, label %for.body, label %exit
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+ %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+ %load.a = load i32, ptr %gep.a, align 2
+ %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+ store i32 %load.a, ptr %gep.c, align 2
+ %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+ %load.b = load i32, ptr %gep.b, align 2
+ %add = add i32 %load.a, %accum
+ %add2 = add i32 %add, %load.b
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %entry, %for.body
+ %result = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+ ret i32 %result
+}
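
With more than one unsafe pointer pair, the tests above show one dependence mask per pair being AND-ed together (ALIAS_LANE_MASK12 and ALIAS_LANE_MASK13) before the combined mask feeds the reduce.add and the per-lane predicate. Modelling each mask as enabling a leading group of lanes, as in the earlier sketch, the AND keeps only the lanes that are safe for every pair, i.e. the minimum of the individual safe-lane counts:

  #include <algorithm>
  #include <cstdint>
  #include <vector>

  // Illustrative only: the combined step is the popcount of the AND of the
  // per-pair masks, which for leading-lane masks is the smallest count.
  int64_t combinedSafeLanes(const std::vector<int64_t> &perPairSafeLanes,
                            int64_t vf) {
    int64_t safe = vf;
    for (int64_t lanes : perPairSafeLanes)
      safe = std::min(safe, lanes);
    return safe; // used as the vector loop step, as in the single-pair case
  }
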
>From f2456107458624425ee91bcd21e54c7b23a42aff Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 15 Aug 2025 10:52:20 +0100
Subject: [PATCH 13/15] Remove LV::useSafeEltsMask
---
.../Transforms/Vectorize/LoopVectorize.cpp | 19 ++++++++-----------
.../AArch64/induction-costs-sve.ll | 12 ++++++------
.../runtime-checks-difference.ll | 5 ++---
3 files changed, 16 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0ed7fefba0fad..839a48511c9e2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1344,7 +1344,10 @@ class LoopVectorizationCostModel {
: ChosenTailFoldingStyle->second;
}
- RTCheckStyle getRTCheckStyle(TailFoldingStyle TFStyle) const {
+ RTCheckStyle getRTCheckStyle(TailFoldingStyle TFStyle, const TargetTransformInfo &TTI) const {
+ if (!TTI.useSafeEltsMask())
+ return RTCheckStyle::ScalarDifference;
+
switch (TFStyle) {
case TailFoldingStyle::Data:
case TailFoldingStyle::DataAndControlFlow:
@@ -1355,8 +1358,8 @@ class LoopVectorizationCostModel {
}
}
- RTCheckStyle getRTCheckStyle() const {
- return getRTCheckStyle(getTailFoldingStyle());
+ RTCheckStyle getRTCheckStyle(const TargetTransformInfo &TTI) const {
+ return getRTCheckStyle(getTailFoldingStyle(), TTI);
}
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
@@ -2079,12 +2082,6 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
-static bool useSafeEltsMask(TailFoldingStyle TFStyle, RTCheckStyle Style,
- const TargetTransformInfo &TTI) {
- return useActiveLaneMask(TFStyle) && Style == RTCheckStyle::UseSafeEltsMask &&
- TTI.useSafeEltsMask();
-}
-
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -8940,7 +8937,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
std::optional<ArrayRef<PointerDiffInfo>> RTChecks =
CM.Legal->getRuntimePointerChecking()->getDiffChecks();
if (RTChecks.has_value() &&
- useSafeEltsMask(Style, CM.getRTCheckStyle(Style), TTI))
+ CM.getRTCheckStyle(Style, TTI) == RTCheckStyle::UseSafeEltsMask)
DiffChecks = *RTChecks;
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
@@ -10150,7 +10147,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (VF.Width.isVector() || SelectedIC > 1) {
TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
bool UseSafeEltsMask =
- useSafeEltsMask(TFStyle, CM.getRTCheckStyle(TFStyle), *TTI);
+ CM.getRTCheckStyle(TFStyle, *TTI) == RTCheckStyle::UseSafeEltsMask;
if (UseSafeEltsMask)
LoopsAliasMasked++;
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 6826cbfa24594..137e07336fd50 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -88,19 +88,19 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-LABEL: define void @iv_casts(
; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; PRED-NEXT: [[ENTRY:.*]]:
-; PRED-NEXT: [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
-; PRED-NEXT: [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; PRED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; PRED: [[VECTOR_MEMCHECK]]:
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
-; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
+; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; PRED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
-; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
+; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
@@ -124,7 +124,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP27:%.*]] = xor i1 [[TMP25]], true
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index e48979c4532a6..b640c1911cb0d 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -339,13 +339,12 @@ exit:
; of the outer loop as start value. It is sufficient to subtract the start
; values (%dst, %src) of the outer AddRecs.
define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %m, i64 noundef %n) {
-;
; CHECK-LABEL: define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(
; CHECK-SAME: ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i64 noundef [[M:%.*]], i64 noundef [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[DST1]], [[SRC2]]
; CHECK-NEXT: br label %[[OUTER_LOOP:.*]]
; CHECK: [[OUTER_LOOP]]:
; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
@@ -353,7 +352,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
-; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[SUB]], 16
; CHECK-NEXT: br i1 [[DIFF_CHECK]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]]
;
entry:
>From 82acca740ce7f319e7287b94c7dbe89996cf67f1 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 15 Aug 2025 11:46:41 +0100
Subject: [PATCH 14/15] Format latest changes
---
llvm/lib/Transforms/Utils/LoopUtils.cpp | 4 ++--
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++-
llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++--
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 ++++++----
4 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index db150d09d26f6..f3abc01ad783c 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2111,8 +2111,8 @@ Value *llvm::addDiffRuntimeChecks(
if (MemoryRuntimeCheck) {
IsConflict =
ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
- }
- MemoryRuntimeCheck = IsConflict;
+ }
+ MemoryRuntimeCheck = IsConflict;
}
return MemoryRuntimeCheck;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 839a48511c9e2..de31489be9493 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1344,7 +1344,8 @@ class LoopVectorizationCostModel {
: ChosenTailFoldingStyle->second;
}
- RTCheckStyle getRTCheckStyle(TailFoldingStyle TFStyle, const TargetTransformInfo &TTI) const {
+ RTCheckStyle getRTCheckStyle(TailFoldingStyle TFStyle,
+ const TargetTransformInfo &TTI) const {
if (!TTI.useSafeEltsMask())
return RTCheckStyle::ScalarDifference;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a4d5cfc164a84..641ad5170f8ff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3277,8 +3277,8 @@ class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
bool isWriteAfterRead() const { return WriteAfterRead; }
-InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
private:
unsigned ElementSize;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bcc8b19913225..1c2b72e0aedc3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3793,12 +3793,14 @@ void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
State.set(this, AliasMask, /*IsScalar=*/false);
}
-InstructionCost
-VPAliasLaneMaskRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const {
+InstructionCost VPAliasLaneMaskRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
- IntrinsicCostAttributes Attrs(isWriteAfterRead() ? Intrinsic::loop_dependence_war_mask : Intrinsic::loop_dependence_raw_mask, RetTy,
- {ArgTy, ArgTy});
+ IntrinsicCostAttributes Attrs(isWriteAfterRead()
+ ? Intrinsic::loop_dependence_war_mask
+ : Intrinsic::loop_dependence_raw_mask,
+ RetTy, {ArgTy, ArgTy});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
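
The war/raw selection being reformatted here corresponds to the two dependence shapes exercised by the tests: a write-after-read pair (the scalar loop reads b before the possibly-overlapping store to c) is costed as llvm.loop.dependence.war.mask, while a read-after-write pair (the store to c happens before the read of b) uses llvm.loop.dependence.raw.mask. Roughly the two scalar shapes, paraphrasing the test sources:

  #include <cstdint>

  // Write-after-read, as in @alias_mask / @alias_mask_multiple.
  void warShape(const uint8_t *a, const uint8_t *b, uint8_t *c, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
      uint8_t x = a[i] + b[i]; // b is read first ...
      c[i] = x;                // ... then c, which may overlap b, is written
    }
  }

  // Read-after-write, as in @alias_mask_read_after_write (simplified).
  int32_t rawShape(const int32_t *a, const int32_t *b, int32_t *c, int64_t n) {
    int32_t sum = 0;
    for (int64_t i = 0; i < n; ++i) {
      c[i] = a[i];        // c, which may overlap b, is written first ...
      sum += a[i] + b[i]; // ... then b is read
    }
    return sum;
  }
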
>From 69afc95fce0333292ee29bafd71aa5a1052a1301 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 15 Aug 2025 15:49:20 +0100
Subject: [PATCH 15/15] Rebase
---
.../LoopVectorize/AArch64/alias_mask.ll | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
index c7344f85f8e02..8a46efce5f117 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -21,8 +21,6 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[B4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[C3]], i64 0
@@ -36,7 +34,7 @@ define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP10]])
; CHECK-NEXT: [[TMP24:%.*]] = zext i8 [[TMP11]] to i64
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP8]], 16
+; CHECK-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP8]], 4
; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[N]], [[TMP14]]
; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP17]], i64 [[TMP15]], i64 0
@@ -101,8 +99,6 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[C4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[B3]], i64 0
@@ -116,7 +112,7 @@ define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n)
; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP10]])
; CHECK-NEXT: [[TMP27:%.*]] = zext i8 [[TMP11]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP9]], 2
; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP18]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[N]], [[TMP18]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0
@@ -191,8 +187,6 @@ define dso_local void @alias_mask_multiple(ptr %a, ptr %b, ptr %c, i64 %n) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[B7]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[A6]], i64 0
@@ -214,7 +208,7 @@ define dso_local void @alias_mask_multiple(ptr %a, ptr %b, ptr %c, i64 %n) {
; CHECK-NEXT: [[TMP17:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP16]])
; CHECK-NEXT: [[TMP18:%.*]] = zext i8 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 16
+; CHECK-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 4
; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[N]], [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp ugt i64 [[N]], [[TMP20]]
; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 0
@@ -283,8 +277,6 @@ define i32 @alias_mask_multiple_read_after_write(ptr %a, ptr %b, ptr %c, i64 %n)
; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[B7]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A6]], i64 0
@@ -306,7 +298,7 @@ define i32 @alias_mask_multiple_read_after_write(ptr %a, ptr %b, ptr %c, i64 %n)
; CHECK-NEXT: [[TMP17:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP16]])
; CHECK-NEXT: [[TMP18:%.*]] = zext i8 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
+; CHECK-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 2
; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[N]], [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp ugt i64 [[N]], [[TMP20]]
; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 0
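
One note on the check updates in this rebase: the step computation switching from mul nuw i64 %vscale, 16 to shl nuw i64 %vscale, 4 (and from a multiply by 4 to a shift by 2) computes the same value, so the tests' behaviour is unchanged. A trivial sanity check of the equivalence, for reference:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint64_t vscale = 1; vscale <= 16; ++vscale) {
      assert((vscale << 4) == vscale * 16); // 16 lanes per part
      assert((vscale << 2) == vscale * 4);  // 4 lanes per part
    }
    return 0;
  }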