[llvm] [LV] Add initial support for partial alias masking (PR #177599)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 16 06:11:05 PST 2026


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/177599

>From 6e324f5966dad22513b215516c13dc0ec175585a Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 20 Jan 2026 18:53:34 +0000
Subject: [PATCH] [LV] Add initial support for partial alias masking

This patch adds initial support for partial alias masking, which allows
entering the vector loop even when there is aliasing within a single
vector iteration. It does this by clamping the VF to the safe distance
between pointers. This allows the runtime VF to be anywhere from 2 to
the "static" VF.

Conceptually, this transform looks like:

```
  // `c` and `b` may alias.
  for (int i = 0; i < n; i++) {
    c[i] = a[i] + b[i];
  }
```

->

```
  svbool_t alias_mask = loop.dependence.war.mask(b, c);
  int num_active = num_active_lanes(alias_mask);
  if (num_active >= 2) {
    for (int i = 0; i < n; i += num_active) {
      // ... vector loop masked with `alias_mask`
    }
  }
  // ... scalar tail
```

Alias masking can be used both with and without tail folding; however,
the current patch has a few limitations:

- Currently, the mask and transform is only valid for IC = 1
  * Some recipes may not handle the "ClampedVF" correctly at IC > 1
  * On AArch64, we only have native alias mask instructions for IC = 1
- The style of vectorization is not enabled by default/costed
  * It can be enabled with `-force-partial-aliasing-vectorization`
  * When enabled, alias masking is used instead of the standard diff
    checks (when legal to do so)

	# Please enter the commit message for your changes. Lines starting
---
 .../Vectorize/LoopVectorizationLegality.h     |  12 +-
 llvm/lib/Analysis/VectorUtils.cpp             |   2 +
 .../Vectorize/LoopVectorizationLegality.cpp   |  15 +-
 .../Vectorize/LoopVectorizationPlanner.h      |   4 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 139 +++-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  15 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  12 +-
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |  13 +-
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.h |   1 +
 .../Vectorize/VPlanConstruction.cpp           |  10 +-
 llvm/lib/Transforms/Vectorize/VPlanHelpers.h  |  12 +-
 .../Transforms/Vectorize/VPlanPredicator.cpp  |  57 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  33 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  55 ++
 .../Transforms/Vectorize/VPlanTransforms.h    |  11 +-
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  |  32 +-
 llvm/lib/Transforms/Vectorize/VPlanUtils.h    |   4 +
 .../LoopVectorize/AArch64/alias-mask.ll       | 689 ++++++++++++++++++
 .../AArch64/vplan-printing-alias-mask.ll      | 290 ++++++++
 .../LoopVectorize/VPlan/vplan-printing.ll     |  12 +-
 .../LoopVectorize/pointer-induction.ll        |   6 +-
 .../reuse-lcssa-phi-scev-expansion.ll         |  12 +-
 .../vplan-printing-alias-mask.ll              | 172 +++++
 23 files changed, 1534 insertions(+), 74 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-alias-mask.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/vplan-printing-alias-mask.ll

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index f82fc588639dd..fe5f21ac61274 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -300,9 +300,17 @@ class LoopVectorizationLegality {
   /// masking.
   bool canFoldTailByMasking() const;
 
+  /// Returns true if all instructions in the loop support masking or
+  /// speculation.
+  ///
+  /// The mask may be loop-invariant if it represents a maximum safe dependence
+  /// distance (alias mask) or loop-variant if it is based on the induction
+  /// variable (e.g. tail-folding).
+  bool canMaskLoop() const;
+
   /// Mark all respective loads/stores for masking. Must only be called when
-  /// tail-folding is possible.
-  void prepareToFoldTailByMasking();
+  /// masking is possible.
+  void prepareToMaskLoop();
 
   /// Returns the primary induction variable.
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index d4083c49626fe..e3cf650ddb76b 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -170,6 +170,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
     return (ScalarOpdIdx == 2);
   case Intrinsic::experimental_vp_splice:
     return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
+  case Intrinsic::loop_dependence_war_mask:
+    return true;
   default:
     return false;
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index e57e0cf636501..66638dec9256b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -2131,6 +2131,15 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
     }
   }
 
+  if (!canMaskLoop())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+
+  return true;
+}
+
+bool LoopVectorizationLegality::canMaskLoop() const {
   // The list of pointers that we can safely read and write to remains empty.
   SmallPtrSet<Value *, 8> SafePointers;
 
@@ -2139,17 +2148,15 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
   SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
   for (BasicBlock *BB : TheLoop->blocks()) {
     if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
-      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
+      LLVM_DEBUG(dbgs() << "LV: Cannot mask loop.\n");
       return false;
     }
   }
 
-  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
-
   return true;
 }
 
-void LoopVectorizationLegality::prepareToFoldTailByMasking() {
+void LoopVectorizationLegality::prepareToMaskLoop() {
   // The list of pointers that we can safely read and write to remains empty.
   SmallPtrSet<Value *, 8> SafePointers;
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 54bb073eb4f81..1019849b1d011 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -668,6 +668,10 @@ class LoopVectorizationPlanner {
   void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
                            bool HasBranchWeights) const;
 
+  VPValue *materializeAliasMask(VPlan &Plan,
+                                ArrayRef<PointerDiffInfo> DiffChecks,
+                                bool HasBranchWeights);
+
 #ifndef NDEBUG
   /// \return The most profitable vectorization factor for the available VPlans
   /// and the cost of that VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 72400e1055427..b528c9723427d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -170,6 +170,8 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
 STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
+STATISTIC(LoopsPartialAliasVectorized,
+          "Number of partial aliasing loops vectorized");
 
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -198,6 +200,10 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
     cl::desc("The maximum allowed number of runtime memory checks"));
 
+static cl::opt<bool> ForcePartialAliasingVectorization(
+    "force-partial-aliasing-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Replace pointer diff checks with alias masks."));
+
 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
 // that predication is preferred, and this lists all options. I.e., the
 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -1381,6 +1387,44 @@ class LoopVectorizationCostModel {
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
+  /// Returns true if all loop blocks should be masked.
+  bool allLoopBlocksMasked() const {
+    return foldTailByMasking() || maskPartialAliasing();
+  }
+
+  void checkIfPartialAliasMaskingIsEnabled() {
+    assert(!IsPartialAliasMaskingEnabled &&
+           "Partial alias masking already checked!");
+    if (!ForcePartialAliasingVectorization || !Legal->canMaskLoop()) {
+      // Option not enabled (or loop cannot be masked).
+      IsPartialAliasMaskingEnabled = false;
+      return;
+    }
+    const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking();
+    if (!Checks) {
+      // Runtime checks not needed for this loop (no alias mask required).
+      IsPartialAliasMaskingEnabled = false;
+      return;
+    }
+    if (auto DiffChecks = Checks->getDiffChecks()) {
+      // We have diff checks. We can use an alias mask.
+      IsPartialAliasMaskingEnabled = !DiffChecks->empty();
+      return;
+    }
+    // Runtime checks are not diff checks (can't be replaced with alias mask).
+    IsPartialAliasMaskingEnabled = false;
+  }
+
+  void disablePartialAliasMaskingIfEnabled() {
+    if (IsPartialAliasMaskingEnabled)
+      IsPartialAliasMaskingEnabled = false;
+  }
+
+  /// Returns true if all loop blocks should have partial aliases masked.
+  bool maskPartialAliasing() const {
+    return IsPartialAliasMaskingEnabled.value_or(false);
+  }
+
   /// Returns true if the use of wide lane masks is requested and the loop is
   /// using tail-folding with a lane mask for control flow.
   bool useWideActiveLaneMask() const {
@@ -1405,7 +1449,7 @@ class LoopVectorizationCostModel {
   /// for any reason, e.g. because tail folding now requires a predicate
   /// or because the block in the original loop was predicated.
   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
-    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
+    return allLoopBlocksMasked() || Legal->blockNeedsPredication(BB);
   }
 
   /// Returns true if VP intrinsics with explicit vector length support should
@@ -1599,6 +1643,9 @@ class LoopVectorizationCostModel {
   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
       ChosenTailFoldingStyle;
 
+  /// true if partial alias masking is enabled (nullopt = undecided).
+  std::optional<bool> IsPartialAliasMaskingEnabled;
+
   /// true if scalable vectorization is supported and enabled.
   std::optional<bool> IsScalableVectorizationAllowed;
 
@@ -1820,14 +1867,18 @@ class GeneratedRTChecks {
   /// The kind of cost that we are calculating
   TTI::TargetCostKind CostKind;
 
+  /// True if the loop is alias-masked (which allows us to omit diff checks).
+  bool LoopUsesAliasMasking = false;
+
 public:
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
-                    TTI::TargetCostKind CostKind)
+                    TTI::TargetCostKind CostKind, bool LoopUsesAliasMasking)
       : DT(DT), LI(LI), TTI(TTI),
         SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
         MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
-        PSE(PSE), CostKind(CostKind) {}
+        PSE(PSE), CostKind(CostKind),
+        LoopUsesAliasMasking(LoopUsesAliasMasking) {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -1880,7 +1931,7 @@ class GeneratedRTChecks {
     }
 
     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
-    if (RtPtrChecking.Need) {
+    if (RtPtrChecking.Need && !LoopUsesAliasMasking) {
       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                  "vector.memcheck");
@@ -2878,8 +2929,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
   if (Legal->blockNeedsPredication(I->getParent()))
     return true;
 
-  // If we're not folding the tail by masking, predication is unnecessary.
-  if (!foldTailByMasking())
+  // If we're not masking, predication is unnecessary.
+  if (!allLoopBlocksMasked())
     return false;
 
   // All that remain are instructions with side-effects originally executed in
@@ -3083,10 +3134,17 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   auto *Ptr = getLoadStorePointerOperand(I);
   auto *ScalarTy = getLoadStoreType(I);
 
+  int Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
   // In order to be widened, the pointer should be consecutive, first of all.
-  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
+  if (!Stride)
     return false;
 
+  // Currently, we can't handle alias masking in reverse. Reversing the alias
+  // mask is not correct (or necessary). When combined with tail-folding the ALM
+  // should only be reversed where the alias-mask is true.
+  if (Stride < 0)
+    disablePartialAliasMaskingIfEnabled();
+
   // If the instruction is a store located in a predicated block, it will be
   // scalarized.
   if (isScalarWithPredication(I, VF))
@@ -3608,6 +3666,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return FixedScalableVFPair::getNone();
   }
 
+  checkIfPartialAliasMaskingIsEnabled();
+
   switch (ScalarEpilogueStatus) {
   case CM_ScalarEpilogueAllowed:
     return computeFeasibleMaxVF(MaxTC, UserVF, UserIC, false);
@@ -4446,6 +4506,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     return Result;
   }
 
+  if (CM.maskPartialAliasing()) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LEV: Epilogue vectorization not supported with alias masking");
+    return Result;
+  }
+
   // Not really a cost consideration, but check for unsupported cases here to
   // simplify the logic.
   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
@@ -5724,7 +5791,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
           // stores.  Note that even with tail folding we know that at least
           // one lane is active (i.e. generalized predication is not possible
           // here), and the logic below depends on this fact.
-          if (!foldTailByMasking())
+          if (!allLoopBlocksMasked())
             return true;
 
           // For scalable vectors, a uniform memop load is always
@@ -6819,8 +6886,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.invalidateCostModelingDecisions();
   }
 
-  if (CM.foldTailByMasking())
-    Legal->prepareToFoldTailByMasking();
+  if (CM.allLoopBlocksMasked())
+    Legal->prepareToMaskLoop();
 
   ElementCount MaxUserVF =
       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
@@ -6932,7 +6999,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     // TODO: Remove this code after stepping away from the legacy cost model and
     // adding code to simplify VPlans before calculating their costs.
     auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
-    if (TC == VF && !CM.foldTailByMasking())
+    if (TC == VF && !CM.allLoopBlocksMasked())
       addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
                                            CostCtx.SkipCostComputation);
 
@@ -7426,6 +7493,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // compactness.
   attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
 
+  VPValue *ClampedVF = nullptr;
+  if (CM.maskPartialAliasing()) {
+    ClampedVF = materializeAliasMask(
+        BestVPlan, *CM.Legal->getRuntimePointerChecking()->getDiffChecks(),
+        HasBranchWeights);
+    ++LoopsPartialAliasVectorized;
+  }
+
   // Retrieving VectorPH now when it's easier while VPlan still has Regions.
   VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
 
@@ -7462,6 +7537,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   VPlanTransforms::materializeVectorTripCount(
       BestVPlan, VectorPH, CM.foldTailByMasking(),
       CM.requiresScalarEpilogue(BestVF.isVector()));
+  VPlanTransforms::fixupVFUsersForClampedVF(BestVPlan, ClampedVF);
   VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
   VPlanTransforms::cse(BestVPlan);
   VPlanTransforms::simplifyRecipes(BestVPlan);
@@ -7480,7 +7556,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
                          OrigLoop->getParentLoop(),
-                         Legal->getWidestInductionType());
+                         Legal->getWidestInductionType(), ClampedVF);
 
 #ifdef EXPENSIVE_CHECKS
   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
@@ -8231,7 +8307,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // ---------------------------------------------------------------------------
   // Predicate and linearize the top-level loop region.
   // ---------------------------------------------------------------------------
-  VPlanTransforms::introduceMasksAndLinearize(*Plan, CM.foldTailByMasking());
+  VPlanTransforms::introduceMasksAndLinearize(*Plan, CM.foldTailByMasking(),
+                                              CM.maskPartialAliasing());
 
   // ---------------------------------------------------------------------------
   // Construct wide recipes and apply predication for original scalar
@@ -8477,9 +8554,9 @@ void LoopVectorizationPlanner::addReductionResultComputation(
     // with fewer lanes than the VF. So the operands of the select would have
     // different numbers of lanes. Partial reductions mask the input instead.
     auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
-    if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
+    if (!PhiR->isInLoop() && CM.allLoopBlocksMasked() &&
         (!RR || !RR->isPartialReduction())) {
-      VPValue *Cond = vputils::findHeaderMask(*Plan);
+      VPValue *Cond = vputils::findLoopBodyMask(*Plan);
       VPIRFlags Flags = PhiTy->isFloatingPointTy()
                             ? VPIRFlags(RdxDesc.getFastMathFlags())
                             : VPIRFlags();
@@ -8682,6 +8759,21 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
   }
 }
 
+VPValue *LoopVectorizationPlanner::materializeAliasMask(
+    VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
+  VPBasicBlock *MinVFCheck = Plan.createVPBasicBlock("vector.min.vf.check");
+  VPValue *ClampedVF = VPlanTransforms::materializeAliasMask(
+      Plan, MinVFCheck,
+      *CM.Legal->getRuntimePointerChecking()->getDiffChecks());
+  VPBuilder Builder(MinVFCheck);
+  Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+  // Check the "ClampedVF" from the alias mask contains at least two elements.
+  VPValue *Cond = Builder.createICmp(
+      CmpInst::ICMP_ULT, ClampedVF, Plan.getConstantInt(IVTy, 2), {}, "cmp.vf");
+  VPlanTransforms::attachCheckBlock(Plan, Cond, MinVFCheck, HasBranchWeights);
+  return ClampedVF;
+}
+
 void LoopVectorizationPlanner::addMinimumIterationCheck(
     VPlan &Plan, ElementCount VF, unsigned UF,
     ElementCount MinProfitableTripCount) const {
@@ -8794,7 +8886,8 @@ static bool processLoopInVPlanNativePath(
   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
 
   {
-    GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
+    GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
+                             CM.maskPartialAliasing());
     InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
                            Checks, BestPlan);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -9643,7 +9736,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   if (ORE->allowExtraAnalysis(LV_NAME))
     LVP.emitInvalidCostRemarks(ORE);
 
-  GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
+  GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
+                           CM.maskPartialAliasing());
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
@@ -9762,6 +9856,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     IC = 1;
   }
 
+  if (CM.maskPartialAliasing()) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Not interleaving due to partial aliasing vectorization.\n");
+    IntDiagMsg = {
+        "PartialAliasingVectorization",
+        "Unable to interleave due to partial aliasing vectorization."};
+    InterleaveLoop = false;
+    IC = 1;
+  }
+
   // Emit diagnostic messages, if any.
   const char *VAPassName = Hints.vectorizeAnalysisPassName();
   if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index e4b05a410d303..c89249ab3a8a2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -240,9 +240,11 @@ VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
                                    ElementCount VF, LoopInfo *LI,
                                    DominatorTree *DT, AssumptionCache *AC,
                                    IRBuilderBase &Builder, VPlan *Plan,
-                                   Loop *CurrentParentLoop, Type *CanonicalIVTy)
+                                   Loop *CurrentParentLoop, Type *CanonicalIVTy,
+                                   VPValue *ClampedVF)
     : TTI(TTI), VF(VF), CFG(DT), LI(LI), AC(AC), Builder(Builder), Plan(Plan),
-      CurrentParentLoop(CurrentParentLoop), TypeAnalysis(*Plan), VPDT(*Plan) {}
+      CurrentParentLoop(CurrentParentLoop), TypeAnalysis(*Plan), VPDT(*Plan),
+      ClampedVF(ClampedVF) {}
 
 Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
   if (isa<VPIRValue, VPSymbolicValue>(Def))
@@ -1073,6 +1075,12 @@ void VPlan::printLiveIns(raw_ostream &O) const {
     O << " = vector-trip-count";
   }
 
+  if (AliasMask.getNumUsers() > 0) {
+    O << "\nLive-in ";
+    AliasMask.printAsOperand(O, SlotTracker);
+    O << " = alias-mask";
+  }
+
   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
     O << "\nLive-in ";
     BackedgeTakenCount->printAsOperand(O, SlotTracker);
@@ -1203,6 +1211,7 @@ VPlan *VPlan::duplicate() {
   Old2NewVPValues[&VF] = &NewPlan->VF;
   Old2NewVPValues[&UF] = &NewPlan->UF;
   Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
+  Old2NewVPValues[&AliasMask] = &NewPlan->AliasMask;
   if (BackedgeTakenCount) {
     NewPlan->BackedgeTakenCount = new VPSymbolicValue();
     Old2NewVPValues[BackedgeTakenCount] = NewPlan->BackedgeTakenCount;
@@ -1496,6 +1505,8 @@ void VPSlotTracker::assignNames(const VPlan &Plan) {
   if (Plan.VFxUF.getNumUsers() > 0)
     assignName(&Plan.VFxUF);
   assignName(&Plan.VectorTripCount);
+  if (Plan.AliasMask.getNumUsers() > 0)
+    assignName(&Plan.AliasMask);
   if (Plan.BackedgeTakenCount)
     assignName(Plan.BackedgeTakenCount);
   for (VPValue *LI : Plan.getLiveIns())
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 63b09c69f54a3..5b337304f27c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1210,8 +1210,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     // part if it is scalar. In the latter case, the recipe will be removed
     // during unrolling.
     ExtractPenultimateElement,
-    LogicalAnd, // Non-poison propagating logical And.
-    LogicalOr,  // Non-poison propagating logical Or.
+    LogicalAnd,     // Non-poison propagating logical And.
+    LogicalOr,      // Non-poison propagating logical Or.
+    NumActiveLanes, // Counts the number of active lanes in a mask.
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
@@ -4537,6 +4538,9 @@ class VPlan {
   /// Represents the loop-invariant VF * UF of the vector loop region.
   VPSymbolicValue VFxUF;
 
+  /// Represents the loop-invariant alias mask of the vector loop region.
+  VPSymbolicValue AliasMask;
+
   /// Contains all the external definitions created for this VPlan, as a mapping
   /// from IR Values to VPIRValues.
   SmallMapVector<Value *, VPIRValue *, 16> LiveIns;
@@ -4679,6 +4683,10 @@ class VPlan {
   /// Returns VF * UF of the vector loop region.
   VPValue &getVFxUF() { return VFxUF; }
 
+  /// Returns alias mask of the vector loop region.
+  VPValue &getAliasMask() { return AliasMask; }
+  const VPValue &getAliasMask() const { return AliasMask; }
+
   LLVMContext &getContext() const {
     return getScalarHeader()->getIRBasicBlock()->getContext();
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 35f4f7c1a48ec..40a9e69589efd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -24,7 +24,8 @@ using namespace VPlanPatternMatch;
 
 #define DEBUG_TYPE "vplan"
 
-VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) : Ctx(Plan.getContext()) {
+VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan)
+    : Ctx(Plan.getContext()), Plan(Plan) {
   if (auto LoopRegion = Plan.getVectorLoopRegion()) {
     if (const auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(
             &LoopRegion->getEntryBasicBlock()->front())) {
@@ -147,6 +148,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
     return inferScalarType(R->getOperand(0));
   case Instruction::ExtractValue:
     return cast<ExtractValueInst>(R->getUnderlyingValue())->getType();
+  case VPInstruction::NumActiveLanes:
+    return Type::getInt64Ty(Ctx);
   default:
     break;
   }
@@ -277,8 +280,12 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
     return IRV->getType();
 
   if (isa<VPSymbolicValue>(V)) {
-    // All VPValues without any underlying IR value (like the vector trip count
-    // or the backedge-taken count) have the same type as the canonical IV.
+    if (V == &Plan.getAliasMask())
+      return IntegerType::getInt1Ty(Ctx);
+
+    // All other VPValues without any underlying IR value (like the vector trip
+    // count or the backedge-taken count) have the same type as the canonical
+    // IV.
     return CanonicalIVTy;
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index dc4be4270f7f1..c268a7f22e339 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -46,6 +46,7 @@ class VPTypeAnalysis {
   /// count).
   Type *CanonicalIVTy;
   LLVMContext &Ctx;
+  const VPlan &Plan;
 
   Type *inferScalarTypeForRecipe(const VPBlendRecipe *R);
   Type *inferScalarTypeForRecipe(const VPInstruction *R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 80847fcbb77fb..def3685f565b8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1020,13 +1020,19 @@ static void addBypassBranch(VPlan &Plan, VPBasicBlock *CheckBlockVPBB,
   }
 }
 
+void VPlanTransforms::attachCheckBlock(VPlan &Plan, VPValue *Cond,
+                                       VPBasicBlock *CheckBlock,
+                                       bool AddBranchWeights) {
+  insertCheckBlockBeforeVectorLoop(Plan, CheckBlock);
+  addBypassBranch(Plan, CheckBlock, Cond, AddBranchWeights);
+}
+
 void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
                                        BasicBlock *CheckBlock,
                                        bool AddBranchWeights) {
   VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
   VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
-  insertCheckBlockBeforeVectorLoop(Plan, CheckBlockVPBB);
-  addBypassBranch(Plan, CheckBlockVPBB, CondVPV, AddBranchWeights);
+  attachCheckBlock(Plan, CondVPV, CheckBlockVPBB, AddBranchWeights);
 }
 
 void VPlanTransforms::addMinimumIterationCheck(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 113ca8c4d0f7c..e42702ba403ed 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -200,7 +200,7 @@ struct VPTransformState {
   VPTransformState(const TargetTransformInfo *TTI, ElementCount VF,
                    LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
                    IRBuilderBase &Builder, VPlan *Plan, Loop *CurrentParentLoop,
-                   Type *CanonicalIVTy);
+                   Type *CanonicalIVTy, VPValue *ClampedVF);
   /// Target Transform Info.
   const TargetTransformInfo *TTI;
 
@@ -278,6 +278,13 @@ struct VPTransformState {
     Iter->second[CacheIdx] = V;
   }
 
+  /// Returns the runtime clamped VF (or nullptr if the VF is not clamped).
+  Value *getRTClampedVF() {
+    if (!ClampedVF)
+      return nullptr;
+    return get(ClampedVF, /*IsScalar=*/true);
+  }
+
   /// Set the debug location in the builder using the debug location \p DL.
   void setDebugLocFrom(DebugLoc DL);
 
@@ -332,6 +339,9 @@ struct VPTransformState {
 
   /// VPlan-based dominator tree.
   VPDominatorTree VPDT;
+
+  /// Runtime clamped VF value.
+  VPValue *ClampedVF = nullptr;
 };
 
 /// Struct to hold various analysis needed for cost computations.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index dbc2e71c785ee..38a6eaf00db1b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -74,7 +74,8 @@ class VPPredicator {
   }
 
   /// Compute and return the mask for the vector loop header block.
-  void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
+  void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail,
+                        bool MaskAliasing);
 
   /// Compute the predicate of \p VPBB, assuming that the header block of the
   /// loop is set to True, or to the loop mask when tail folding.
@@ -153,25 +154,38 @@ void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
   setBlockInMask(VPBB, BlockMask);
 }
 
-void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
-  if (!FoldTail) {
+void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail,
+                                    bool MaskAliasing) {
+  if (!FoldTail && !MaskAliasing) {
     setBlockInMask(HeaderVPBB, nullptr);
     return;
   }
 
-  // Introduce the early-exit compare IV <= BTC to form header block mask.
-  // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
-  // constructing the desired canonical IV in the header block as its first
-  // non-phi instructions.
-
+  VPValue *BlockMask = nullptr;
   auto &Plan = *HeaderVPBB->getPlan();
-  auto *IV =
-      new VPWidenCanonicalIVRecipe(HeaderVPBB->getParent()->getCanonicalIV());
-  Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
-  Builder.insert(IV);
 
-  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-  VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+  if (FoldTail) {
+    // Introduce the early-exit compare IV <= BTC to form header block mask.
+    // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+    // constructing the desired canonical IV in the header block as its first
+    // non-phi instructions.
+
+    auto *IV =
+        new VPWidenCanonicalIVRecipe(HeaderVPBB->getParent()->getCanonicalIV());
+    Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
+    Builder.insert(IV);
+
+    VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+    BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+  }
+
+  if (MaskAliasing) {
+    if (BlockMask)
+      BlockMask = Builder.createAnd(BlockMask, &Plan.getAliasMask());
+    else
+      BlockMask = &Plan.getAliasMask();
+  }
+
   setBlockInMask(HeaderVPBB, BlockMask);
 }
 
@@ -265,7 +279,8 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
   }
 }
 
-void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
+void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail,
+                                                 bool MaskAliasing) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
@@ -280,7 +295,7 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
     // convert all phi recipes of VPBB to blend recipes unless VPBB is the
     // header.
     if (VPBB == Header) {
-      Predicator.createHeaderMask(Header, FoldTail);
+      Predicator.createHeaderMask(Header, FoldTail, MaskAliasing);
     } else {
       Predicator.createBlockInMask(VPBB);
       Predicator.convertPhisToBlends(VPBB);
@@ -314,11 +329,11 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
     PrevVPBB = VPBB;
   }
 
-  // If we folded the tail and introduced a header mask, any extract of the
-  // last element must be updated to extract from the last active lane of the
-  // header mask instead (i.e., the lane corresponding to the last active
-  // iteration).
-  if (FoldTail) {
+  // If we folded the tail and introduced a header mask, or have partial alias
+  // masking, any extract of the last element must be updated to extract from
+  // the last active lane of the header mask instead (i.e., the lane
+  // corresponding to the last active iteration).
+  if (FoldTail || MaskAliasing) {
     assert(Plan.getExitBlocks().size() == 1 &&
            "only a single-exit block is supported currently");
     assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 362c5b0353c7f..42d0a5355ef64 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -458,6 +458,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
   case VPInstruction::ResumeForEpilogue:
   case VPInstruction::Reverse:
   case VPInstruction::Unpack:
+  case VPInstruction::NumActiveLanes:
     return 1;
   case Instruction::ICmp:
   case Instruction::FCmp:
@@ -607,6 +608,20 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
   }
+  case VPInstruction::NumActiveLanes: {
+    Value *Op = State.get(getOperand(0));
+    auto *VecTy = cast<VectorType>(Op->getType());
+    assert(VecTy->getScalarSizeInBits() == 1 &&
+           "NumActiveLanes only implemented for i1 vectors");
+
+    Value *ZExt = Builder.CreateCast(
+        Instruction::ZExt, Op,
+        VectorType::get(Builder.getInt32Ty(), VecTy->getElementCount()));
+    Value *Count =
+        Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt);
+    return Builder.CreateCast(Instruction::ZExt, Count, Builder.getInt64Ty(),
+                              "num.active.lanes");
+  }
   case VPInstruction::FirstOrderRecurrenceSplice: {
     // Generate code to combine the previous and current values in vector v3.
     //
@@ -628,8 +643,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
   case VPInstruction::CalculateTripCountMinusVF: {
     unsigned UF = getParent()->getPlan()->getConcreteUF();
+    assert((UF == 1 || !State.ClampedVF) && "Expected UF == 1 with ClampedVF");
+    Value *ClampedVF = State.getRTClampedVF();
     Value *ScalarTC = State.get(getOperand(0), VPLane(0));
-    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
+    Value *Step =
+        ClampedVF ? ClampedVF
+                  : createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
     Value *Sub = Builder.CreateSub(ScalarTC, Step);
     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
     Value *Zero = ConstantInt::getNullValue(ScalarTC->getType());
@@ -655,9 +674,13 @@ Value *VPInstruction::generate(VPTransformState &State) {
     unsigned Part = getUnrollPart(*this);
     auto *IV = State.get(getOperand(0), VPLane(0));
     assert(Part != 0 && "Must have a positive part");
+    Value *ClampedVF = State.getRTClampedVF();
+    assert((!ClampedVF || Part == 1) && "Expected Part == 1 with ClampedVF");
     // The canonical IV is incremented by the vectorization factor (num of
     // SIMD elements) times the unroll part.
-    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
+    Value *Step = ClampedVF
+                      ? ClampedVF
+                      : createStepForVF(Builder, IV->getType(), State.VF, Part);
     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                              hasNoSignedWrap());
   }
@@ -1272,7 +1295,8 @@ bool VPInstruction::isVectorToScalar() const {
          getOpcode() == VPInstruction::ComputeAnyOfResult ||
          getOpcode() == VPInstruction::ExtractLastActive ||
          getOpcode() == VPInstruction::ComputeReductionResult ||
-         getOpcode() == VPInstruction::AnyOf;
+         getOpcode() == VPInstruction::AnyOf ||
+         getOpcode() == VPInstruction::NumActiveLanes;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -1546,6 +1570,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ExtractLastActive:
     O << "extract-last-active";
     break;
+  case VPInstruction::NumActiveLanes:
+    O << "num-active-lanes";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index bc9fe1eb81416..5cf76d6aa6f13 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5065,6 +5065,7 @@ void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
   Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
   VPValue &VF = Plan.getVF();
   VPValue &VFxUF = Plan.getVFxUF();
+
   // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
   // used.
   // TODO: Assert that they aren't used.
@@ -5097,6 +5098,60 @@ void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
   VFxUF.replaceAllUsesWith(MulByUF);
 }
 
+VPValue *
+VPlanTransforms::materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheck,
+                                      ArrayRef<PointerDiffInfo> DiffChecks) {
+  VPValue &AliasMask = Plan.getAliasMask();
+  VPBuilder Builder(AliasCheck, AliasCheck->begin());
+  Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
+  Type *I64Ty = IntegerType::getInt64Ty(Plan.getContext());
+  Type *PtrTy = PointerType::getUnqual(Plan.getContext());
+
+  VPValue *Mask = nullptr;
+  for (PointerDiffInfo Check : DiffChecks) {
+    VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart);
+    VPValue *Sink =
+        vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart);
+
+    VPValue *SrcPtr =
+        Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src, PtrTy,
+                                 DebugLoc::getCompilerGenerated());
+    VPValue *SinkPtr =
+        Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink, PtrTy,
+                                 DebugLoc::getCompilerGenerated());
+
+    VPWidenIntrinsicRecipe *WARMask = new VPWidenIntrinsicRecipe(
+        Intrinsic::loop_dependence_war_mask,
+        {SrcPtr, SinkPtr, Plan.getConstantInt(I64Ty, Check.AccessSize)}, I1Ty);
+    Builder.insert(WARMask);
+
+    if (Mask)
+      Mask = Builder.createAnd(Mask, WARMask);
+    else
+      Mask = WARMask;
+  }
+
+  // Replace all users of the symbolic alias-mask with the materialized value.
+  AliasMask.replaceAllUsesWith(Mask);
+
+  Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+  VPValue *NumActive =
+      Builder.createNaryOp(VPInstruction::NumActiveLanes, {Mask});
+  return Builder.createScalarZExtOrTrunc(NumActive, IVTy, I64Ty,
+                                         DebugLoc::getCompilerGenerated());
+}
+
+void VPlanTransforms::fixupVFUsersForClampedVF(VPlan &Plan,
+                                               VPValue *ClampedVF) {
+  if (!ClampedVF)
+    return;
+
+  assert(Plan.getConcreteUF() == 1 &&
+         "Clamped VF not supported with interleaving");
+  Plan.getVF().replaceAllUsesWith(ClampedVF);
+  Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
+}
+
 DenseMap<const SCEV *, Value *>
 VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
   SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b76fde2bd1217..bb5e7d0ac043b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -174,6 +174,8 @@ struct VPlanTransforms {
   /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
   /// VPValue and connect the block to \p Plan, using the VPValue as branch
   /// condition.
+  static void attachCheckBlock(VPlan &Plan, VPValue *Cond,
+                               VPBasicBlock *CheckBlock, bool AddBranchWeights);
   static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
                                bool AddBranchWeights);
 
@@ -418,6 +420,12 @@ struct VPlanTransforms {
   static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
                                  ElementCount VF);
 
+  static VPValue *materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheck,
+                                       ArrayRef<PointerDiffInfo> DiffChecks);
+
+  /// Replaces all users of the VF and VFxUF with the runtime clamped VF.
+  static void fixupVFUsersForClampedVF(VPlan &Plan, VPValue *ClampedVF);
+
   /// Expand VPExpandSCEVRecipes in \p Plan's entry block. Each
   /// VPExpandSCEVRecipe is replaced with a live-in wrapping the expanded IR
   /// value. A mapping from SCEV expressions to their expanded IR value is
@@ -443,7 +451,8 @@ struct VPlanTransforms {
   /// Predicate and linearize the control-flow in the only loop region of
   /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
   /// header, otherwise use all-true for the header mask.
-  static void introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);
+  static void introduceMasksAndLinearize(VPlan &Plan, bool FoldTail,
+                                         bool MaskAliasing);
 
   /// Add branch weight metadata, if the \p Plan's middle block is terminated by
   /// a BranchOnCond recipe.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index f5318bb1c6515..985c0254ca148 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -46,11 +46,20 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
   if (U && !isa<Instruction>(U->getValue()))
     return Plan.getOrAddLiveIn(U->getValue());
   auto *Expanded = new VPExpandSCEVRecipe(Expr);
-  Plan.getEntry()->appendRecipe(Expanded);
+  VPBasicBlock *EntryVPBB = Plan.getEntry();
+  Plan.getEntry()->insert(Expanded, EntryVPBB->getFirstNonPhi());
   return Expanded;
 }
 
 bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
+  if (V == &Plan.getAliasMask())
+    return true;
+
+  VPValue *Mask;
+  if (match(V,
+            m_c_BinaryAnd(m_VPValue(Mask), m_Specific(&Plan.getAliasMask()))))
+    V = Mask;
+
   if (isa<VPActiveLaneMaskPHIRecipe>(V))
     return true;
 
@@ -606,9 +615,30 @@ VPSingleDefRecipe *vputils::findHeaderMask(VPlan &Plan) {
       HeaderMask = VPI;
     }
   }
+
   return HeaderMask;
 }
 
+VPValue *vputils::findLoopBodyMask(VPlan &Plan) {
+  VPValue *LoopMask = findHeaderMask(Plan);
+
+  // If an alias-mask is in use, ensure that it is included in the loop mask.
+  VPValue *AliasMask = &Plan.getAliasMask();
+  if (AliasMask->getNumUsers() > 0) {
+    if (LoopMask) {
+      assert(AliasMask->hasOneUse() &&
+             "expected one use (`loop-mask = and alias-mask, lane-mask`)");
+      auto *VPI = dyn_cast<VPInstruction>(AliasMask->getSingleUser());
+      if (vputils::isHeaderMask(VPI, Plan))
+        LoopMask = VPI;
+    } else {
+      LoopMask = AliasMask;
+    }
+  }
+
+  return LoopMask;
+}
+
 bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
                             const VPDominatorTree &VPDT) {
   auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index a5692699d9d76..c087619f2b02d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -151,6 +151,10 @@ VPInstruction *findComputeReductionResult(VPReductionPHIRecipe *PhiR);
 /// the header-mask pattern manually.
 VPSingleDefRecipe *findHeaderMask(VPlan &Plan);
 
+/// Finds the mask for the loop body. This differs from `findHeaderMask` as it
+/// will include the alias-mask (if present).
+VPValue *findLoopBodyMask(VPlan &Plan);
+
 } // namespace vputils
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll
new file mode 100644
index 0000000000000..078a6e1c2dbd9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll
@@ -0,0 +1,689 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 5
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -force-partial-aliasing-vectorization %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -force-partial-aliasing-vectorization -prefer-predicate-over-epilogue=predicate-dont-vectorize %s | FileCheck %s --check-prefix=CHECK-TF
+
+define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define void @alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[B3:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-NEXT:    [[C2:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MIN_VF_CHECK:.*]]
+; CHECK:       [[VECTOR_MIN_VF_CHECK]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[B3]] to ptr
+; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[C2]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP12]], ptr [[TMP9]], i64 1)
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP8]])
+; CHECK-NEXT:    [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2
+; CHECK-NEXT:    br i1 [[CMP_VF]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[NUM_ACTIVE_LANES]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP7]], <vscale x 16 x i1> [[ALIAS_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> [[ALIAS_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP14:%.*]] = select <vscale x 16 x i1> [[ALIAS_LANE_MASK]], <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], <vscale x 16 x i8> splat (i8 1)
+; CHECK-NEXT:    [[TMP10:%.*]] = sdiv <vscale x 16 x i8> [[WIDE_MASKED_LOAD3]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP10]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[ALIAS_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[NUM_ACTIVE_LANES]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+; CHECK-TF-LABEL: define void @alias_mask(
+; CHECK-TF-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-TF-NEXT:  [[ENTRY:.*:]]
+; CHECK-TF-NEXT:    [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-TF-NEXT:    [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-TF-NEXT:    [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-TF-NEXT:    br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]]
+; CHECK-TF:       [[FOR_BODY_PREHEADER]]:
+; CHECK-TF-NEXT:    br label %[[VECTOR_MIN_VF_CHECK:.*]]
+; CHECK-TF:       [[VECTOR_MIN_VF_CHECK]]:
+; CHECK-TF-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[B2]] to ptr
+; CHECK-TF-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[C1]] to ptr
+; CHECK-TF-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP1]], ptr [[TMP2]], i64 1)
+; CHECK-TF-NEXT:    [[TMP4:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i32>
+; CHECK-TF-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP4]])
+; CHECK-TF-NEXT:    [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-TF-NEXT:    [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2
+; CHECK-TF-NEXT:    br i1 [[CMP_VF]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-TF:       [[VECTOR_PH]]:
+; CHECK-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[NUM_ACTIVE_LANES]]
+; CHECK-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[NUM_ACTIVE_LANES]]
+; CHECK-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-TF:       [[VECTOR_BODY]]:
+; CHECK-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[TMP10:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-TF-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[TMP13:%.*]] = select <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], <vscale x 16 x i8> splat (i8 1)
+; CHECK-TF-NEXT:    [[TMP14:%.*]] = sdiv <vscale x 16 x i8> [[WIDE_MASKED_LOAD3]], [[TMP13]]
+; CHECK-TF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[TMP10]])
+; CHECK-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-TF-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-TF-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP16]], true
+; CHECK-TF-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-TF:       [[MIDDLE_BLOCK]]:
+; CHECK-TF-NEXT:    br [[EXIT_LOOPEXIT:label %.*]]
+; CHECK-TF:       [[SCALAR_PH]]:
+;
+
+entry:
+  %cmp11 = icmp sgt i64 %n, 0
+  br i1 %cmp11, label %for.body, label %exit
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %div = sdiv i8 %load.b, %load.a
+  %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+  store i8 %div, ptr %gep.c, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  ret void
+}
+
+; Note: This test could emit a `llvm.loop.dependence.raw` mask to avoid creating
+; a dependency between the store and the load, but it is not necessary for
+; correctness.
+define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define i32 @alias_mask_read_after_write(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[C2:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-NEXT:    [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-NEXT:    [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP19]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MIN_VF_CHECK:.*]]
+; CHECK:       [[VECTOR_MIN_VF_CHECK]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[C2]] to ptr
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[B1]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr [[TMP9]], ptr [[TMP4]], i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2
+; CHECK-NEXT:    br i1 [[CMP_VF]], label %[[SCALAR_PH]], label %[[VECTOR_PH1:.*]]
+; CHECK:       [[VECTOR_PH1]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[NUM_ACTIVE_LANES]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP7]], <vscale x 4 x i1> [[ALIAS_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr align 2 [[TMP8]], <vscale x 4 x i1> [[ALIAS_LANE_MASK]])
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP15]], <vscale x 4 x i1> [[ALIAS_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[WIDE_MASKED_LOAD3]]
+; CHECK-NEXT:    [[TMP12]] = select <vscale x 4 x i1> [[ALIAS_LANE_MASK]], <vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[NUM_ACTIVE_LANES]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+; CHECK-TF-LABEL: define i32 @alias_mask_read_after_write(
+; CHECK-TF-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-TF-NEXT:  [[ENTRY:.*:]]
+; CHECK-TF-NEXT:    [[C2:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-TF-NEXT:    [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-TF-NEXT:    [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-TF-NEXT:    br i1 [[CMP19]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]]
+; CHECK-TF:       [[FOR_BODY_PREHEADER]]:
+; CHECK-TF-NEXT:    br label %[[VECTOR_MIN_VF_CHECK:.*]]
+; CHECK-TF:       [[VECTOR_MIN_VF_CHECK]]:
+; CHECK-TF-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[C2]] to ptr
+; CHECK-TF-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[B1]] to ptr
+; CHECK-TF-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr [[TMP1]], ptr [[TMP2]], i64 4)
+; CHECK-TF-NEXT:    [[TMP4:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i32>
+; CHECK-TF-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP4]])
+; CHECK-TF-NEXT:    [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-TF-NEXT:    [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2
+; CHECK-TF-NEXT:    br i1 [[CMP_VF]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-TF:       [[VECTOR_PH]]:
+; CHECK-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[NUM_ACTIVE_LANES]]
+; CHECK-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[NUM_ACTIVE_LANES]]
+; CHECK-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-TF:       [[VECTOR_BODY]]:
+; CHECK-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[TMP10:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-TF-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> poison)
+; CHECK-TF-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr align 2 [[TMP12]], <vscale x 4 x i1> [[TMP10]])
+; CHECK-TF-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP13]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> poison)
+; CHECK-TF-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-TF-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP14]], [[WIDE_MASKED_LOAD3]]
+; CHECK-TF-NEXT:    [[TMP16]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> [[TMP15]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-TF-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-TF-NEXT:    [[TMP18:%.*]] = xor i1 [[TMP17]], true
+; CHECK-TF-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-TF:       [[MIDDLE_BLOCK]]:
+; CHECK-TF-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; CHECK-TF-NEXT:    br [[EXIT_LOOPEXIT:label %.*]]
+; CHECK-TF:       [[SCALAR_PH]]:
+;
+
+
+entry:
+  %cmp19 = icmp sgt i64 %n, 0
+  br i1 %cmp19, label %for.body, label %exit
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+  %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+  %load.a = load i32, ptr %gep.a, align 2
+  %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+  store i32 %load.a, ptr %gep.c, align 2
+  %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+  %load.b = load i32, ptr %gep.b, align 2
+  %add = add i32 %load.a, %accum
+  %add2 = add i32 %add, %load.b
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                        ; preds = %entry, %for.body
+  %result = phi i32 [ 0, %entry ], [ %add2, %for.body ]
+  ret i32 %result
+}
+
+define void @alias_mask_multiple(ptr %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define void @alias_mask_multiple(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A6:%.*]] = ptrtoaddr ptr [[A]] to i64
+; CHECK-NEXT:    [[B3:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-NEXT:    [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MIN_VF_CHECK:.*]]
+; CHECK:       [[VECTOR_MIN_VF_CHECK]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[A6]] to ptr
+; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[C1]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK0:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP9]], ptr [[TMP10]], i64 1)
+; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[B3]] to ptr
+; CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[C1]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK1:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP13]], ptr [[TMP14]], i64 1)
+; CHECK-NEXT:    [[TMP15:%.*]] = and <vscale x 16 x i1> [[ALIAS_LANE_MASK0]], [[ALIAS_LANE_MASK1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <vscale x 16 x i1> [[TMP15]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP16]])
+; CHECK-NEXT:    [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2
+; CHECK-NEXT:    br i1 [[CMP_VF]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[NUM_ACTIVE_LANES]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP18:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP18]], ptr align 1 [[TMP19]], <vscale x 16 x i1> [[TMP15]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[NUM_ACTIVE_LANES]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+; CHECK-TF-LABEL: define void @alias_mask_multiple(
+; CHECK-TF-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-TF-NEXT:  [[ENTRY:.*:]]
+; CHECK-TF-NEXT:    [[A7:%.*]] = ptrtoaddr ptr [[A]] to i64
+; CHECK-TF-NEXT:    [[B3:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-TF-NEXT:    [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-TF-NEXT:    [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-TF-NEXT:    br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]]
+; CHECK-TF:       [[FOR_BODY_PREHEADER]]:
+; CHECK-TF-NEXT:    br label %[[VECTOR_MIN_VF_CHECK:.*]]
+; CHECK-TF:       [[VECTOR_MIN_VF_CHECK]]:
+; CHECK-TF-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[A7]] to ptr
+; CHECK-TF-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[C1]] to ptr
+; CHECK-TF-NEXT:    [[ALIAS_LANE_MASK0:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP2]], ptr [[TMP3]], i64 1)
+; CHECK-TF-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[B3]] to ptr
+; CHECK-TF-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[C1]] to ptr
+; CHECK-TF-NEXT:    [[ALIAS_LANE_MASK1:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP5]], ptr [[TMP6]], i64 1)
+; CHECK-TF-NEXT:    [[TMP8:%.*]] = and <vscale x 16 x i1> [[ALIAS_LANE_MASK0]], [[ALIAS_LANE_MASK1]]
+; CHECK-TF-NEXT:    [[TMP7:%.*]] = zext <vscale x 16 x i1> [[TMP8]] to <vscale x 16 x i32>
+; CHECK-TF-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP7]])
+; CHECK-TF-NEXT:    [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-TF-NEXT:    [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2
+; CHECK-TF-NEXT:    br i1 [[CMP_VF]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-TF:       [[VECTOR_PH]]:
+; CHECK-TF-NEXT:    [[TMP14:%.*]] = sub i64 [[N]], [[NUM_ACTIVE_LANES]]
+; CHECK-TF-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[N]], [[NUM_ACTIVE_LANES]]
+; CHECK-TF-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-TF:       [[VECTOR_BODY]]:
+; CHECK-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[TMP17:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP8]]
+; CHECK-TF-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP18]], <vscale x 16 x i1> [[TMP17]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP19]], <vscale x 16 x i1> [[TMP17]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD8]], [[WIDE_MASKED_LOAD]]
+; CHECK-TF-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr align 1 [[TMP21]], <vscale x 16 x i1> [[TMP17]])
+; CHECK-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP16]])
+; CHECK-TF-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-TF-NEXT:    [[TMP23:%.*]] = xor i1 [[TMP22]], true
+; CHECK-TF-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-TF:       [[MIDDLE_BLOCK]]:
+; CHECK-TF-NEXT:    br [[EXIT_LOOPEXIT:label %.*]]
+; CHECK-TF:       [[SCALAR_PH]]:
+;
+
+entry:
+  %cmp11 = icmp sgt i64 %n, 0
+  br i1 %cmp11, label %for.body, label %exit
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %add = add i8 %load.b, %load.a
+  %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+  store i8 %add, ptr %gep.c, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  ret void
+}
+
+; Checks using a scalar outside the loop, which requires extracting the last
+; active element.
+define i8 @alias_masking_exit_value(ptr %ptrA, ptr %ptrB) {
+; CHECK-LABEL: define i8 @alias_masking_exit_value(
+; CHECK-SAME: ptr [[PTRA:%.*]], ptr [[PTRB:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PTRA2:%.*]] = ptrtoaddr ptr [[PTRA]] to i64
+; CHECK-NEXT:    [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 1000, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MIN_VF_CHECK:.*]]
+; CHECK:       [[VECTOR_MIN_VF_CHECK]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[PTRA2]] to ptr
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[PTRB1]] to ptr
+; CHECK-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP2]], ptr [[TMP3]], i64 1)
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP5]])
+; CHECK-NEXT:    [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i64 [[NUM_ACTIVE_LANES]] to i32
+; CHECK-NEXT:    [[CMP_VF:%.*]] = icmp ult i32 [[TMP7]], 2
+; CHECK-NEXT:    br i1 [[CMP_VF]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1000, [[TMP7]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 1000, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP7]] to i8
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP9]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i8> [ [[TMP8]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i32 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP10]], <vscale x 16 x i1> [[ALIAS_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 16 x i8> [[VEC_IND]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP12]], ptr align 1 [[TMP11]], <vscale x 16 x i1> [[ALIAS_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP7]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i8> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1000, [[N_VEC]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 16 x i1> [[ALIAS_LANE_MASK]], splat (i1 true)
+; CHECK-NEXT:    [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP14]], i1 false)
+; CHECK-NEXT:    [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 16 x i8> [[TMP12]], i64 [[LAST_ACTIVE_LANE]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+; CHECK-TF-LABEL: define i8 @alias_masking_exit_value(
+; CHECK-TF-SAME: ptr [[PTRA:%.*]], ptr [[PTRB:%.*]]) #[[ATTR0]] {
+; CHECK-TF-NEXT:  [[ENTRY:.*:]]
+; CHECK-TF-NEXT:    [[PTRA2:%.*]] = ptrtoaddr ptr [[PTRA]] to i64
+; CHECK-TF-NEXT:    [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64
+; CHECK-TF-NEXT:    br label %[[VECTOR_MIN_VF_CHECK:.*]]
+; CHECK-TF:       [[VECTOR_MIN_VF_CHECK]]:
+; CHECK-TF-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[PTRA2]] to ptr
+; CHECK-TF-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[PTRB1]] to ptr
+; CHECK-TF-NEXT:    [[ALIAS_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP0]], ptr [[TMP1]], i64 1)
+; CHECK-TF-NEXT:    [[TMP3:%.*]] = zext <vscale x 16 x i1> [[ALIAS_LANE_MASK]] to <vscale x 16 x i32>
+; CHECK-TF-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP3]])
+; CHECK-TF-NEXT:    [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-TF-NEXT:    [[TMP5:%.*]] = trunc i64 [[NUM_ACTIVE_LANES]] to i32
+; CHECK-TF-NEXT:    [[CMP_VF:%.*]] = icmp ult i32 [[TMP5]], 2
+; CHECK-TF-NEXT:    br i1 [[CMP_VF]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-TF:       [[VECTOR_PH]]:
+; CHECK-TF-NEXT:    [[TMP8:%.*]] = sub i32 1000, [[TMP5]]
+; CHECK-TF-NEXT:    [[TMP9:%.*]] = icmp ugt i32 1000, [[TMP5]]
+; CHECK-TF-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 0
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1000)
+; CHECK-TF-NEXT:    [[TMP11:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
+; CHECK-TF-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP5]] to i8
+; CHECK-TF-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP12]], i64 0
+; CHECK-TF-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-TF-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-TF:       [[VECTOR_BODY]]:
+; CHECK-TF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i8> [ [[TMP11]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[TMP13:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]]
+; CHECK-TF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i32 [[INDEX]]
+; CHECK-TF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i32 [[INDEX]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[TMP16:%.*]] = add <vscale x 16 x i8> [[VEC_IND]], [[WIDE_MASKED_LOAD]]
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP16]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[TMP13]])
+; CHECK-TF-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP5]]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP10]])
+; CHECK-TF-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-TF-NEXT:    [[TMP18:%.*]] = xor i1 [[TMP17]], true
+; CHECK-TF-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i8> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-TF-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-TF:       [[MIDDLE_BLOCK]]:
+; CHECK-TF-NEXT:    [[TMP19:%.*]] = xor <vscale x 16 x i1> [[TMP13]], splat (i1 true)
+; CHECK-TF-NEXT:    [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP19]], i1 false)
+; CHECK-TF-NEXT:    [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
+; CHECK-TF-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 16 x i8> [[TMP16]], i64 [[LAST_ACTIVE_LANE]]
+; CHECK-TF-NEXT:    br [[EXIT:label %.*]]
+; CHECK-TF:       [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %gepA = getelementptr inbounds i8, ptr %ptrA, i32 %iv
+  %gepB = getelementptr inbounds i8, ptr %ptrB, i32 %iv
+  %loadA = load i8, ptr %gepA
+  %iv.trunc = trunc i32 %iv to i8
+  %add = add i8 %iv.trunc, %loadA
+  store i8 %add, ptr %gepB
+  %iv.next = add nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, 1000
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %exit.value = phi i8 [ %add, %loop ]
+  ret i8 %exit.value
+}
+
+; Unsupported: Reversing the alias mask is not correct.
+define void @alias_mask_reverse_iterate(ptr noalias %ptrA, ptr %ptrB, ptr %ptrC, i64 %n) {
+; CHECK-LABEL: define void @alias_mask_reverse_iterate(
+; CHECK-SAME: ptr noalias [[PTRA:%.*]], ptr [[PTRB:%.*]], ptr [[PTRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[VECTOR_PH:.*]]:
+; CHECK-NEXT:    [[PTRC2:%.*]] = ptrtoaddr ptr [[PTRC]] to i64
+; CHECK-NEXT:    [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64
+; CHECK-NEXT:    [[IV_START:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[IV_START]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[PTRB1]], [[PTRC2]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 5
+; CHECK-NEXT:    [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[IV_START]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK:       [[VECTOR_PH1]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 1
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[IV_START]], [[TMP8]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[IV_START]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[IV_START]], [[INDEX]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sub nuw nsw i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP11]], -1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[TMP27]]
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 -1, [[TMP7]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[TMP27]], [[TMP16]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[TMP17]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[REVERSE5:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_LOAD4]])
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i64 [[TMP27]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i64 [[TMP17]]
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[REVERSE8:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_LOAD6]])
+; CHECK-NEXT:    [[REVERSE9:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_LOAD7]])
+; CHECK-NEXT:    [[TMP32:%.*]] = add <vscale x 16 x i8> [[REVERSE8]], [[REVERSE]]
+; CHECK-NEXT:    [[TMP33:%.*]] = add <vscale x 16 x i8> [[REVERSE9]], [[REVERSE5]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[PTRC]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP27]]
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP17]]
+; CHECK-NEXT:    [[REVERSE10:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[TMP32]])
+; CHECK-NEXT:    [[REVERSE11:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[TMP33]])
+; CHECK-NEXT:    store <vscale x 16 x i8> [[REVERSE10]], ptr [[TMP39]], align 1
+; CHECK-NEXT:    store <vscale x 16 x i8> [[REVERSE11]], ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP45]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[IV_START]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[SCALAR_PH:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[IV_START]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]]
+; CHECK:       [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[N_MOD_VF12:%.*]] = urem i64 [[IV_START]], 8
+; CHECK-NEXT:    [[N_VEC13:%.*]] = sub i64 [[IV_START]], [[N_MOD_VF12]]
+; CHECK-NEXT:    [[TMP46:%.*]] = sub i64 [[IV_START]], [[N_VEC13]]
+; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX1:%.*]] = sub i64 [[IV_START]], [[INDEX14]]
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i64 [[OFFSET_IDX1]]
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[TMP47]], i64 -7
+; CHECK-NEXT:    [[WIDE_LOAD15:%.*]] = load <8 x i8>, ptr [[TMP49]], align 1
+; CHECK-NEXT:    [[REVERSE16:%.*]] = shufflevector <8 x i8> [[WIDE_LOAD15]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i64 [[OFFSET_IDX1]]
+; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP50]], i64 -7
+; CHECK-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x i8>, ptr [[TMP52]], align 1
+; CHECK-NEXT:    [[REVERSE18:%.*]] = shufflevector <8 x i8> [[WIDE_LOAD17]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP53:%.*]] = add <8 x i8> [[REVERSE18]], [[REVERSE16]]
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[PTRC]], i64 [[OFFSET_IDX1]]
+; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i64 -7
+; CHECK-NEXT:    [[REVERSE19:%.*]] = shufflevector <8 x i8> [[TMP53]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    store <8 x i8> [[REVERSE19]], ptr [[TMP56]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT20]] = add nuw i64 [[INDEX14]], 8
+; CHECK-NEXT:    [[TMP57:%.*]] = icmp eq i64 [[INDEX_NEXT20]], [[N_VEC13]]
+; CHECK-NEXT:    br i1 [[TMP57]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N21:%.*]] = icmp eq i64 [[IV_START]], [[N_VEC13]]
+; CHECK-NEXT:    br i1 [[CMP_N21]], label %[[SCALAR_PH]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP46]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], %[[VECTOR_MEMCHECK]] ], [ [[IV_START]], %[[VECTOR_PH]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i64 [[IV]]
+; CHECK-NEXT:    [[LOADA:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i64 [[IV]]
+; CHECK-NEXT:    [[LOADB:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOADB]], [[LOADA]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[PTRC]], i64 [[IV]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EC]], label %[[SCALAR_PH]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    ret void
+;
+; CHECK-TF-LABEL: define void @alias_mask_reverse_iterate(
+; CHECK-TF-SAME: ptr noalias [[PTRA:%.*]], ptr [[PTRB:%.*]], ptr [[PTRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-TF-NEXT:  [[VECTOR_PH:.*:]]
+; CHECK-TF-NEXT:    [[PTRC2:%.*]] = ptrtoaddr ptr [[PTRC]] to i64
+; CHECK-TF-NEXT:    [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64
+; CHECK-TF-NEXT:    [[IV_START:%.*]] = add i64 [[N]], -1
+; CHECK-TF-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-TF:       [[VECTOR_BODY]]:
+; CHECK-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-TF-NEXT:    [[TMP2:%.*]] = sub i64 [[PTRB1]], [[PTRC2]]
+; CHECK-TF-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; CHECK-TF-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH1:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK-TF:       [[VECTOR_PH1]]:
+; CHECK-TF-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 4
+; CHECK-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4
+; CHECK-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[IV_START]], [[TMP6]]
+; CHECK-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[IV_START]], [[TMP6]]
+; CHECK-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[IV_START]])
+; CHECK-TF-NEXT:    br label %[[VECTOR_BODY1:.*]]
+; CHECK-TF:       [[VECTOR_BODY1]]:
+; CHECK-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH1]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-TF-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[IV_START]], [[INDEX]]
+; CHECK-TF-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i64 [[OFFSET_IDX]]
+; CHECK-TF-NEXT:    [[TMP11:%.*]] = sub nuw nsw i64 [[TMP4]], 1
+; CHECK-TF-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], -1
+; CHECK-TF-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP12]]
+; CHECK-TF-NEXT:    [[REVERSE:%.*]] = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP15]], <vscale x 16 x i1> [[REVERSE]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[REVERSE3:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_LOAD]])
+; CHECK-TF-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i64 [[OFFSET_IDX]]
+; CHECK-TF-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP12]]
+; CHECK-TF-NEXT:    [[REVERSE4:%.*]] = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP21]], <vscale x 16 x i1> [[REVERSE4]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[REVERSE6:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_LOAD5]])
+; CHECK-TF-NEXT:    [[TMP22:%.*]] = add <vscale x 16 x i8> [[REVERSE6]], [[REVERSE3]]
+; CHECK-TF-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[PTRC]], i64 [[OFFSET_IDX]]
+; CHECK-TF-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[TMP26]], i64 [[TMP12]]
+; CHECK-TF-NEXT:    [[REVERSE7:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[TMP22]])
+; CHECK-TF-NEXT:    [[REVERSE8:%.*]] = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[REVERSE7]], ptr align 1 [[TMP28]], <vscale x 16 x i1> [[REVERSE8]])
+; CHECK-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-TF-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-TF-NEXT:    [[TMP30:%.*]] = xor i1 [[TMP29]], true
+; CHECK-TF-NEXT:    br i1 [[TMP30]], label %[[SCALAR_PH:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-TF:       [[SCALAR_PH]]:
+; CHECK-TF-NEXT:    br [[EXIT:label %.*]]
+; CHECK-TF:       [[SCALAR_PH1]]:
+;
+entry:
+  %iv.start = add nsw i64 %n, -1
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr inbounds i8, ptr %ptrA, i64 %iv
+  %loadA = load i8, ptr %gep.A, align 1
+  %gep.B = getelementptr inbounds i8, ptr %ptrB, i64 %iv
+  %loadB = load i8, ptr %gep.B, align 1
+  %add = add i8 %loadB, %loadA
+  %gep.C = getelementptr inbounds i8, ptr %ptrC, i64 %iv
+  store i8 %add, ptr %gep.C, align 1
+  %iv.next = add nsw i64 %iv, -1
+  %ec = icmp eq i64 %iv.next, 0
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[PROF11]] = !{!"branch_weights", i32 8, i32 24}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
+;.
+; CHECK-TF: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-TF: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-TF: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-TF: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-TF: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-TF: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-TF: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-alias-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-alias-mask.ll
new file mode 100644
index 0000000000000..db64edf682732
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-alias-mask.ll
@@ -0,0 +1,290 @@
+; REQUIRES: asserts
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -debug-only=loop-vectorize -mattr=+sve2 -passes=loop-vectorize -force-partial-aliasing-vectorization -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -debug-only=loop-vectorize -mattr=+sve2 -passes=loop-vectorize -force-partial-aliasing-vectorization -prefer-predicate-over-epilogue=predicate-dont-vectorize -disable-output %s 2>&1 | FileCheck %s --check-prefix=CHECK-TF
+
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: 'alias_mask'
+; CHECK:      VPlan 'Initial VPlan for VF={2,4,8,16},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.]]> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[ALIAS_MASK:%.]]> = alias-mask
+; CHECK-NEXT: vp<[[TC:%.]]> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body.preheader>:
+; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:   IR   %wide.trip.count = zext nneg i32 %n to i64
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT:     vp<[[PTR_A:%.+]]> = vector-pointer inbounds ir<%arrayidx>
+; CHECK-NEXT:     WIDEN ir<%0> = load vp<[[PTR_A]]>, vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:     CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT:     vp<[[PTR_B:%.+]]> = vector-pointer inbounds ir<%arrayidx2>
+; CHECK-NEXT:     WIDEN ir<%1> = load vp<[[PTR_B]]>, vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:     WIDEN ir<%add> = add ir<%1>, ir<%0>
+; CHECK-NEXT:     CLONE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<[[STEPS]]>
+; CHECK-NEXT:     vp<[[PTR_C:%.+]]> = vector-pointer inbounds ir<%arrayidx6>
+; CHECK-NEXT:     WIDEN store vp<[[PTR_C]]>, ir<%add>, vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT branch-on-count vp<%index.next>, vp<[[VEC_TC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<%cmp.n> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VEC_TC]]>, middle.block ], [ ir<0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT:   IR   %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+; CHECK-NEXT:   IR   %0 = load i8, ptr %arrayidx, align 1
+; CHECK-NEXT:   IR   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+; CHECK-NEXT:   IR   %1 = load i8, ptr %arrayidx2, align 1
+; CHECK-NEXT:   IR   %add = add i8 %1, %0
+; CHECK-NEXT:   IR   %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+; CHECK-NEXT:   IR   store i8 %add, ptr %arrayidx6, align 1
+; CHECK-NEXT:   IR   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT:   IR   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+; CHECK: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4,vscale x 8,vscale x 16},UF={1}' {
+; CHECK-NEXT: Live-in ir<%wide.trip.count> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body.preheader>:
+; CHECK-NEXT:   IR   %wide.trip.count = zext nneg i32 %n to i64
+; CHECK-NEXT:   IR   [[VSCALE:%.+]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:   IR   [[MINTC:%.+]] = shl nuw i64 [[VSCALE]], 4
+; CHECK-NEXT:   EMIT vp<%min.iters.check> = icmp ult ir<%wide.trip.count>, ir<[[MINTC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%min.iters.check>
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.min.vf.check
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.min.vf.check:
+; CHECK-NEXT:   EMIT-SCALAR vp<[[PTR_B:%.+]]> = inttoptr ir<%b2> to ptr
+; CHECK-NEXT:   EMIT-SCALAR vp<[[PTR_C:%.+]]> = inttoptr ir<%c1> to ptr
+; CHECK-NEXT:   WIDEN-INTRINSIC vp<[[ALIAS_MASK:%.+]]> = call llvm.loop.dependence.war.mask(vp<[[PTR_B]]>, vp<[[PTR_C]]>, ir<1>)
+; CHECK-NEXT:   EMIT vp<[[CLAMPED_VF:%.+]]> = num-active-lanes vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:   EMIT vp<%cmp.vf> = icmp ult vp<[[CLAMPED_VF]]>, ir<2>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%cmp.vf>
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   EMIT vp<%n.mod.vf> = urem ir<%wide.trip.count>, vp<[[CLAMPED_VF]]>
+; CHECK-NEXT:   EMIT vp<%n.vec> = sub ir<%wide.trip.count>, vp<%n.mod.vf>
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT:   CLONE ir<[[VEC_PTR_A:%.+]]> = getelementptr inbounds ir<%a>, vp<%index>
+; CHECK-NEXT:   WIDEN ir<[[VEC_A:%.+]]> = load ir<[[VEC_PTR_A]]>, vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:   CLONE ir<[[VEC_PTR_B:%.+]]> = getelementptr inbounds ir<%b>, vp<%index>
+; CHECK-NEXT:   WIDEN ir<[[VEC_B:%.+]]> = load ir<[[VEC_PTR_B]]>, vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:   WIDEN ir<[[ADD:%.+]]> = add ir<[[VEC_B]]>, ir<[[VEC_A]]>
+; CHECK-NEXT:   CLONE ir<[[VEC_PTR_C:%.+]]> = getelementptr inbounds ir<%c>, vp<%index>
+; CHECK-NEXT:   WIDEN store ir<[[VEC_PTR_C]]>, ir<[[ADD]]>, vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:   EMIT vp<%index.next> = add nuw vp<%index>, vp<[[CLAMPED_VF]]>
+; CHECK-NEXT:   EMIT vp<[[EXIT_COND:%.+]]> = icmp eq vp<%index.next>, vp<%n.vec>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[EXIT_COND]]>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<%cmp.n> = icmp eq ir<%wide.trip.count>, vp<%n.vec>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<scalar.ph>:
+; CHECK-NEXT:   EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%n.vec>, middle.block ], [ ir<0>, ir-bb<for.body.preheader> ], [ ir<0>, vector.min.vf.check ]
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
+; CHECK-NEXT:   IR   %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+; CHECK-NEXT:   IR   %2 = load i8, ptr %arrayidx, align 1
+; CHECK-NEXT:   IR   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+; CHECK-NEXT:   IR   %3 = load i8, ptr %arrayidx2, align 1
+; CHECK-NEXT:   IR   %add = add i8 %3, %2
+; CHECK-NEXT:   IR   %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+; CHECK-NEXT:   IR   store i8 %add, ptr %arrayidx6, align 1
+; CHECK-NEXT:   IR   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT:   IR   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+
+; CHECK-TF-LABEL: 'alias_mask'
+; CHECK-TF:      VPlan 'Initial VPlan for VF={2,4,8,16},UF>=1' {
+; CHECK-TF-NEXT: Live-in vp<[[VF:%.]]> = VF
+; CHECK-TF-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
+; CHECK-TF-NEXT: Live-in vp<[[ALIAS_MASK:%.]]> = alias-mask
+; CHECK-TF-NEXT: vp<[[TC:%.]]> = original trip-count
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: ir-bb<for.body.preheader>:
+; CHECK-TF-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-TF-NEXT:   IR   %wide.trip.count = zext nneg i32 %n to i64
+; CHECK-TF-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: vector.ph:
+; CHECK-TF-NEXT:   EMIT vp<%index.part.next> = VF * Part + ir<0>
+; CHECK-TF-NEXT:   EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, vp<[[TC]]>, ir<1>
+; CHECK-TF-NEXT: Successor(s): vector loop
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: <x1> vector loop: {
+; CHECK-TF-NEXT:   vector.body:
+; CHECK-TF-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-TF-NEXT:     ACTIVE-LANE-MASK-PHI vp<[[LANE_MASK:%.+]]> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next>
+; CHECK-TF-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-TF-NEXT:     EMIT vp<[[MASK:%.+]]> = and vp<[[LANE_MASK]]>, vp<[[ALIAS_MASK]]>
+; CHECK-TF-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
+; CHECK-TF-NEXT:     vp<[[PTR_A:%.+]]> = vector-pointer inbounds ir<%arrayidx>
+; CHECK-TF-NEXT:     WIDEN ir<%0> = load vp<[[PTR_A]]>, vp<[[MASK]]>
+; CHECK-TF-NEXT:     CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]>
+; CHECK-TF-NEXT:     vp<[[PTR_B:%.+]]> = vector-pointer inbounds ir<%arrayidx2>
+; CHECK-TF-NEXT:     WIDEN ir<%1> = load vp<[[PTR_B]]>, vp<[[MASK]]>
+; CHECK-TF-NEXT:     WIDEN ir<%add> = add ir<%1>, ir<%0>
+; CHECK-TF-NEXT:     CLONE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<[[STEPS]]>
+; CHECK-TF-NEXT:     vp<[[PTR_C:%.+]]> = vector-pointer inbounds ir<%arrayidx6>
+; CHECK-TF-NEXT:     WIDEN store vp<[[PTR_C]]>, ir<%add>, vp<[[MASK]]>
+; CHECK-TF-NEXT:     EMIT vp<%index.next> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-TF-NEXT:     EMIT vp<[[PART_IDX:%.+]]> = VF * Part + vp<%index.next>
+; CHECK-TF-NEXT:     EMIT vp<%active.lane.mask.next> = active lane mask vp<[[PART_IDX]]>, vp<[[TC]]>, ir<1>
+; CHECK-TF-NEXT:     EMIT vp<[[NOT_MASK:%.+]]> = not vp<%active.lane.mask.next>
+; CHECK-TF-NEXT:     EMIT branch-on-cond vp<[[NOT_MASK]]>
+; CHECK-TF-NEXT:   No successors
+; CHECK-TF-NEXT: }
+; CHECK-TF-NEXT: Successor(s): middle.block
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: middle.block:
+; CHECK-TF-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-TF-NEXT: No successors
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: scalar.ph:
+; CHECK-TF-NEXT:   EMIT-SCALAR vp<%bc.resume.val> = phi [ ir<0>, ir-bb<for.body.preheader> ]
+; CHECK-TF-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: ir-bb<for.body>:
+; CHECK-TF-NEXT:   IR   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-TF-NEXT:   IR   %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+; CHECK-TF-NEXT:   IR   %0 = load i8, ptr %arrayidx, align 1
+; CHECK-TF-NEXT:   IR   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+; CHECK-TF-NEXT:   IR   %1 = load i8, ptr %arrayidx2, align 1
+; CHECK-TF-NEXT:   IR   %add = add i8 %1, %0
+; CHECK-TF-NEXT:   IR   %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+; CHECK-TF-NEXT:   IR   store i8 %add, ptr %arrayidx6, align 1
+; CHECK-TF-NEXT:   IR   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-TF-NEXT:   IR   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-TF-NEXT: No successors
+; CHECK-TF-NEXT: }
+
+; CHECK-TF:      VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4,vscale x 8,vscale x 16},UF={1}' {
+; CHECK-TF-NEXT: Live-in ir<%wide.trip.count> = original trip-count
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: ir-bb<for.body.preheader>:
+; CHECK-TF-NEXT:   IR   %wide.trip.count = zext nneg i32 %n to i64
+; CHECK-TF-NEXT: Successor(s): vector.min.vf.check
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: vector.min.vf.check:
+; CHECK-TF-NEXT:   EMIT-SCALAR vp<[[PTR_B:%.+]]> = inttoptr ir<%b2> to ptr
+; CHECK-TF-NEXT:   EMIT-SCALAR vp<[[PTR_C:%.+]]> = inttoptr ir<%c1> to ptr
+; CHECK-TF-NEXT:   WIDEN-INTRINSIC vp<[[ALIAS_MASK:%.+]]> = call llvm.loop.dependence.war.mask(vp<[[PTR_B]]>, vp<[[PTR_C]]>, ir<1>)
+; CHECK-TF-NEXT:   EMIT vp<[[CLAMPED_VF:%.+]]> = num-active-lanes vp<[[ALIAS_MASK]]>
+; CHECK-TF-NEXT:   EMIT vp<%cmp.vf> = icmp ult vp<[[CLAMPED_VF]]>, ir<2>
+; CHECK-TF-NEXT:   EMIT branch-on-cond vp<%cmp.vf>
+; CHECK-TF-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: vector.ph:
+; CHECK-TF-NEXT:   EMIT vp<[[TC_MINUS_VF:%.+]]> = TC > VF ? TC - VF : 0 ir<%wide.trip.count>
+; CHECK-TF-NEXT:   EMIT vp<%active.lane.mask.entry> = active lane mask ir<0>, ir<%wide.trip.count>, ir<1>
+; CHECK-TF-NEXT: Successor(s): vector.body
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: vector.body:
+; CHECK-TF-NEXT:   EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-TF-NEXT:   ACTIVE-LANE-MASK-PHI vp<[[LANE_MASK:%.+]]> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next>
+; CHECK-TF-NEXT:   EMIT vp<[[MASK:%.+]]> = and vp<[[LANE_MASK]]>, vp<[[ALIAS_MASK]]>
+; CHECK-TF-NEXT:   CLONE ir<[[VEC_PTR_A:%.+]]> = getelementptr inbounds ir<%a>, vp<%index>
+; CHECK-TF-NEXT:   WIDEN ir<[[VEC_A:%.+]]> = load ir<[[VEC_PTR_A]]>, vp<[[MASK]]>
+; CHECK-TF-NEXT:   CLONE ir<[[VEC_PTR_B:%.+]]> = getelementptr inbounds ir<%b>, vp<%index>
+; CHECK-TF-NEXT:   WIDEN ir<[[VEC_B:%.+]]> = load ir<[[VEC_PTR_B]]>, vp<[[MASK]]>
+; CHECK-TF-NEXT:   WIDEN ir<[[ADD:%.+]]> = add ir<[[VEC_B]]>, ir<[[VEC_A]]>
+; CHECK-TF-NEXT:   CLONE ir<[[VEC_PTR_C:%.+]]> = getelementptr inbounds ir<%c>, vp<%index>
+; CHECK-TF-NEXT:   WIDEN store ir<[[VEC_PTR_C]]>, ir<[[ADD]]>, vp<[[MASK]]>
+; CHECK-TF-NEXT:   EMIT vp<%index.next> = add vp<%index>, vp<[[CLAMPED_VF]]>
+; CHECK-TF-NEXT:   EMIT vp<%active.lane.mask.next> = active lane mask vp<%index>, vp<[[TC_MINUS_VF]]>, ir<1>
+; CHECK-TF-NEXT:   EMIT vp<[[EXIT_COND:%.+]]> = not vp<%active.lane.mask.next>
+; CHECK-TF-NEXT:   EMIT branch-on-cond vp<[[EXIT_COND]]>
+; CHECK-TF-NEXT: Successor(s): middle.block, vector.body
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: middle.block:
+; CHECK-TF-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-TF-NEXT: No successors
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: ir-bb<scalar.ph>:
+; CHECK-TF-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-TF-EMPTY:
+; CHECK-TF-NEXT: ir-bb<for.body>:
+; CHECK-TF-NEXT:   IR   %indvars.iv = phi i64 [ 0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: ir<0> from ir-bb<scalar.ph>)
+; CHECK-TF-NEXT:   IR   %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+; CHECK-TF-NEXT:   IR   %0 = load i8, ptr %arrayidx, align 1
+; CHECK-TF-NEXT:   IR   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+; CHECK-TF-NEXT:   IR   %1 = load i8, ptr %arrayidx2, align 1
+; CHECK-TF-NEXT:   IR   %add = add i8 %1, %0
+; CHECK-TF-NEXT:   IR   %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+; CHECK-TF-NEXT:   IR   store i8 %add, ptr %arrayidx6, align 1
+; CHECK-TF-NEXT:   IR   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-TF-NEXT:   IR   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-TF-NEXT: No successors
+; CHECK-TF-NEXT: }
+
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %add = add i8 %1, %0
+  %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+  store i8 %add, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll
index 0d923183e251a..2a77ae3609ad7 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll
@@ -478,12 +478,12 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT:  Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count
 ; CHECK-NEXT:  vp<[[VP3:%[0-9]+]]> = original trip-count
 ; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<entry>:
-; CHECK-NEXT:    IR   %div = udiv i64 %y, 492802768830814060
-; CHECK-NEXT:    IR   %inc = add i64 %div, 1
-; CHECK-NEXT:    EMIT vp<[[VP3]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060))<nuw><nsw> /u (1 + (%y /u 492802768830814060))<nuw><nsw>))<nuw><nsw>
-; CHECK-NEXT:    EMIT vp<[[VP4:%[0-9]+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060))<nuw><nsw>
-; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT:   EMIT vp<[[VP4:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060))<nuw><nsw>
+; CHECK-NEXT:   EMIT vp<[[VP3]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060))<nuw><nsw> /u (1 + (%y /u 492802768830814060))<nuw><nsw>))<nuw><nsw>
+; CHECK-NEXT:   IR   %div = udiv i64 %y, 492802768830814060
+; CHECK-NEXT:   IR   %inc = add i64 %div, 1
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
 ; CHECK-NEXT:    vp<[[VP5:%[0-9]+]]> = DERIVED-IV ir<0> + vp<[[VP2]]> * vp<[[VP4]]>
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index fe0c839f3f030..fae0e7bab72e6 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -437,8 +437,8 @@ define i64 @ivopt_widen_ptr_indvar_1(ptr noalias %a, i64 %stride, i64 %n) {
 ;
 ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_1(
 ; STRIDED-NEXT:  entry:
-; STRIDED-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT:    [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3
+; STRIDED-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
 ; STRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; STRIDED:       vector.ph:
@@ -522,8 +522,8 @@ define i64 @ivopt_widen_ptr_indvar_2(ptr noalias %a, i64 %stride, i64 %n) {
 ;
 ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_2(
 ; STRIDED-NEXT:  entry:
-; STRIDED-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT:    [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3
+; STRIDED-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
 ; STRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; STRIDED:       vector.ph:
@@ -629,8 +629,8 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) {
 ;
 ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_3(
 ; STRIDED-NEXT:  entry:
-; STRIDED-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT:    [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3
+; STRIDED-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
 ; STRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; STRIDED:       vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index 55c73cb0928ff..c97fc36ac76d1 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -205,10 +205,15 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog
 ; CHECK-NEXT:    br i1 [[INVAR_C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]]
 ; CHECK:       [[LOOP_2_PREHEADER]]:
 ; CHECK-NEXT:    [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 2, [[STEP]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP0]]
+; CHECK-NEXT:    [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[INDVAR]], -1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP3]], -1
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[IV_1_LCSSA]], [[STEP]]
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[STEP]], -2
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[INDVAR]], -1
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[SMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 1)
@@ -217,11 +222,6 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1)
 ; CHECK-NEXT:    [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP16:%.*]] = sub i32 2, [[STEP]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP16]]
-; CHECK-NEXT:    [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0)
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP3]], -1
-; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP15]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
 ; CHECK:       [[VECTOR_SCEVCHECK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-alias-mask.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-alias-mask.ll
new file mode 100644
index 0000000000000..1eb9bcafb9a9d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-alias-mask.ll
@@ -0,0 +1,172 @@
+; REQUIRES: asserts
+; RUN: opt -S -debug-only=loop-vectorize -force-vector-width=4 -passes=loop-vectorize -force-partial-aliasing-vectorization -disable-output %s 2>&1 | FileCheck %s
+
+define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: 'alias_mask'
+; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[ALIAS_MASK:%.+]]> = alias-mask
+; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body.preheader>:
+; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:   IR   %wide.trip.count = zext nneg i32 %n to i64
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:   Successor(s): pred.load
+; CHECK-EMPTY:
+; CHECK-NEXT:   <xVFxUF> pred.load: {
+; CHECK-NEXT:     pred.load.entry:
+; CHECK-NEXT:       BRANCH-ON-MASK vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:     Successor(s): pred.load.if, pred.load.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:     pred.load.if:
+; CHECK-NEXT:       REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT:       REPLICATE ir<%0> = load ir<%arrayidx> (S->V)
+; CHECK-NEXT:       REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT:       REPLICATE ir<%1> = load ir<%arrayidx2> (S->V)
+; CHECK-NEXT:     Successor(s): pred.load.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:     pred.load.continue:
+; CHECK-NEXT:       PHI-PREDICATED-INSTRUCTION vp<[[VEC_A:%.+]]> = ir<%0>
+; CHECK-NEXT:       PHI-PREDICATED-INSTRUCTION vp<[[VEC_B:%.+]]> = ir<%1>
+; CHECK-NEXT:     No successors
+; CHECK-NEXT:   }
+; CHECK-NEXT:   Successor(s): for.body.1
+; CHECK-EMPTY:
+; CHECK-NEXT:   for.body.1:
+; CHECK-NEXT:     WIDEN ir<%add> = add vp<[[VEC_B]]>, vp<[[VEC_A]]>
+; CHECK-NEXT:   Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT:   <xVFxUF> pred.store: {
+; CHECK-NEXT:     pred.store.entry:
+; CHECK-NEXT:       BRANCH-ON-MASK vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:     Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:     pred.store.if:
+; CHECK-NEXT:       REPLICATE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<[[STEPS]]>
+; CHECK-NEXT:       REPLICATE store ir<%add>, ir<%arrayidx6>
+; CHECK-NEXT:     Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:     pred.store.continue:
+; CHECK-NEXT:     No successors
+; CHECK-NEXT:   }
+; CHECK-NEXT:   Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT:   for.body.2:
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT branch-on-count vp<%index.next>, vp<[[VEC_TC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+
+; CHECK:      VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in ir<%wide.trip.count> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body.preheader>:
+; CHECK-NEXT:   IR   %wide.trip.count = zext nneg i32 %n to i64
+; CHECK-NEXT:   EMIT vp<%min.iters.check> = icmp ult ir<%wide.trip.count>, ir<4>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%min.iters.check>
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.min.vf.check
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.min.vf.check:
+; CHECK-NEXT:   EMIT-SCALAR vp<[[PTR_B:%.+]]> = inttoptr ir<%b2> to ptr
+; CHECK-NEXT:   EMIT-SCALAR vp<[[PTR_C:%.+]]> = inttoptr ir<%c1> to ptr
+; CHECK-NEXT:   WIDEN-INTRINSIC vp<[[ALIAS_MASK:%.+]]> = call llvm.loop.dependence.war.mask(vp<[[PTR_B]]>, vp<[[PTR_C]]>, ir<1>)
+; CHECK-NEXT:   EMIT vp<[[CLAMPED_VF:%.+]]> = num-active-lanes vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:   EMIT vp<%cmp.vf> = icmp ult vp<[[CLAMPED_VF]]>, ir<2>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%cmp.vf>
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   EMIT vp<%n.mod.vf> = urem ir<%wide.trip.count>, vp<[[CLAMPED_VF]]>
+; CHECK-NEXT:   EMIT vp<%n.vec> = sub ir<%wide.trip.count>, vp<%n.mod.vf>
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, for.body.2 ]
+; CHECK-NEXT:   vp<%8> = SCALAR-STEPS vp<%index>, ir<1>, vp<[[CLAMPED_VF]]>
+; CHECK-NEXT: Successor(s): pred.load
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.load: {
+; CHECK-NEXT:   pred.load.entry:
+; CHECK-NEXT:     BRANCH-ON-MASK vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:   Successor(s): pred.load.if, pred.load.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:   pred.load.if:
+; CHECK-NEXT:     REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%8>
+; CHECK-NEXT:     REPLICATE ir<%0> = load ir<%arrayidx> (S->V)
+; CHECK-NEXT:     REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%8>
+; CHECK-NEXT:     REPLICATE ir<%1> = load ir<%arrayidx2> (S->V)
+; CHECK-NEXT:   Successor(s): pred.load.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:   pred.load.continue:
+; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[VEC_A:%.+]]> = ir<%0>
+; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[VEC_B:%.+]]> = ir<%1>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.1
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.1:
+; CHECK-NEXT:   WIDEN ir<%add> = add vp<[[VEC_B]]>, vp<[[VEC_A]]>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT:   pred.store.entry:
+; CHECK-NEXT:     BRANCH-ON-MASK vp<[[ALIAS_MASK]]>
+; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:   pred.store.if:
+; CHECK-NEXT:     REPLICATE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<%8>
+; CHECK-NEXT:     REPLICATE store ir<%add>, ir<%arrayidx6>
+; CHECK-NEXT:   Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:   pred.store.continue:
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT:   EMIT vp<%index.next> = add nuw vp<%index>, vp<[[CLAMPED_VF]]>
+; CHECK-NEXT:   EMIT vp<[[EXIT_COND:%.+]]> = icmp eq vp<%index.next>, vp<%n.vec>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[EXIT_COND]]>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %add = add i8 %1, %0
+  %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+  store i8 %add, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}



More information about the llvm-commits mailing list