[llvm] [LV] Mask off possibly aliasing vector lanes (PR #100579)

Tue Oct 22 00:57:54 PDT 2024

https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/100579

>From 96b8b65968c9f8b6665b001b7ee92da6164e8bf6 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 18 Oct 2024 15:49:22 +0100
Subject: [PATCH] [LV] Mask off possibly aliasing vector lanes

    When vectorising a loop that uses loads and stores, those pointers could
    overlap if their difference is less than the vector factor. For example,
    if address 20 is being stored to and address 23 is being loaded from, they
    overlap when the vector factor is 4 or higher. Currently LoopVectorize
    branches to a scalar loop in these cases with a runtime check. Howver if
    we construct a mask that disables the overlapping (aliasing) lanes then
    the vectorised loop can be safely entered, as long as the loads and
    stores are masked off.
---
 .../llvm/Analysis/TargetTransformInfo.h       |   7 ++
 .../Vectorize/LoopVectorizationPlanner.h      |  22 +++-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  89 +++++++++----
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   1 -
 llvm/lib/Transforms/Vectorize/VPlan.h         |  48 +++++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  64 +++++++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  68 ++++++++--
 .../Transforms/Vectorize/VPlanTransforms.h    |   6 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   1 +
 .../LoopVectorize/AArch64/alias_mask.ll       | 117 ++++++++++++++++++
 .../AArch64/induction-costs-sve.ll            |  25 +++-
 .../runtime-check-small-clamped-bounds.ll     |  22 ++--
 .../runtime-checks-difference.ll              |  62 +++++-----
 13 files changed, 443 insertions(+), 89 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0459941fe05cdc..860db629d2358a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -194,6 +194,13 @@ enum class TailFoldingStyle {
   DataWithEVL,
 };
 
+enum class RTCheckStyle {
+  /// Branch to scalar loop if checks fails at runtime.
+  ScalarFallback,
+  /// Form a mask based on elements which won't be a WAR or RAW hazard
+  UseSafeEltsMask,
+};
+
 struct TailFoldingInfo {
   TargetLibraryInfo *TLI;
   LoopVectorizationLegality *LVL;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 1c8d541ef2c51f..4d3a54526d7d56 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -418,7 +418,13 @@ class LoopVectorizationPlanner {
   /// Build VPlans for the specified \p UserVF and \p UserIC if they are
   /// non-zero or all applicable candidate VFs otherwise. If vectorization and
   /// interleaving should be avoided up-front, no plans are generated.
-  void plan(ElementCount UserVF, unsigned UserIC);
+  /// RTChecks is a list of pointer pairs that should be checked for aliasing,
+  /// setting HasAliasMask to true in the case that an alias mask is generated
+  /// and the vector loop should be entered even if the pointers alias across a
+  /// loop iteration.
+  void plan(ElementCount UserVF, unsigned UserIC,
+            std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
+            bool &HasAliasMask);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
@@ -495,12 +501,22 @@ class LoopVectorizationPlanner {
   /// returned VPlan is valid for. If no VPlan can be built for the input range,
   /// set the largest included VF to the maximum VF for which no plan could be
   /// built.
-  VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range);
+  /// RTChecks is a list of pointer pairs that should be checked for aliasing,
+  /// setting HasAliasMask to true in the case that an alias mask is generated
+  /// and the vector loop should be entered even if the pointers alias across a
+  /// loop iteration.
+  VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range,
+                                        ArrayRef<PointerDiffInfo> RTChecks,
+                                        bool &HasAliasMask);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
-  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
+  /// RTChecks contains a list of pointer pairs that an alias mask should be
+  /// generated for.
+  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
+                                ArrayRef<PointerDiffInfo> RTChecks,
+                                bool &HasAliasMask);
 
   // Adjust the recipes for reductions. For in-loop reductions the chain of
   // instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8653498d32a12..6ca5581683e824 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -173,6 +173,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");
 
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -1806,6 +1807,10 @@ class GeneratedRTChecks {
   PredicatedScalarEvolution &PSE;
 
 public:
+  /// Set by VPlan when the vector loop should be entered even when runtime
+  /// checks determine that pointers alias within an iteration.
+  bool HasAliasMask = false;
+
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
                     const DataLayout &DL, bool AddBranchWeights)
@@ -1847,9 +1852,11 @@ class GeneratedRTChecks {
 
     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
     if (RtPtrChecking.Need) {
-      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
-      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
-                                 "vector.memcheck");
+      if (!MemCheckBlock) {
+        auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
+        MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
+                                   "vector.memcheck");
+      }
 
       auto DiffChecks = RtPtrChecking.getDiffChecks();
       if (DiffChecks) {
@@ -2077,11 +2084,18 @@ class GeneratedRTChecks {
     if (OuterLoop)
       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
 
-    BranchInst &BI =
-        *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
-    if (AddBranchWeights) {
+    // TODO: Branch to the vector preheader conditionally based on the number of
+    // non-aliasing elements. The scalar loop will likely be better if only one
+    // or two elements will be processed per vectorised loop iteration.
+
+    // Jump to the vector preheader unconditionally if it's safe to do so
+    // because an alias mask has been set up.
+    BranchInst &BI = HasAliasMask
+                         ? *BranchInst::Create(LoopVectorPreHeader)
+                         : *BranchInst::Create(Bypass, LoopVectorPreHeader,
+                                               MemRuntimeCheckCond);
+    if (!HasAliasMask && AddBranchWeights)
       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
-    }
     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
     MemCheckBlock->getTerminator()->setDebugLoc(
         Pred->getTerminator()->getDebugLoc());
@@ -2564,7 +2578,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
     });
   }
 
-  LoopBypassBlocks.push_back(MemCheckBlock);
+  /// If an alias mask has been set up then we don't need the bypass as the
+  /// vector preheader will be branched to unconditionally
+  if (!RTChecks.HasAliasMask)
+    LoopBypassBlocks.push_back(MemCheckBlock);
 
   AddedSafetyChecks = true;
 
@@ -7097,7 +7114,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   return VectorizationFactor::Disabled();
 }
 
-void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+void LoopVectorizationPlanner::plan(
+    ElementCount UserVF, unsigned UserIC,
+    std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
@@ -7106,6 +7125,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
     return;
 
+  ArrayRef<PointerDiffInfo> DiffChecks;
+  if (RTChecks.has_value() && useActiveLaneMask(CM.getTailFoldingStyle(true)))
+    DiffChecks = *RTChecks;
+
   // Invalidate interleave groups if all blocks of loop will be predicated.
   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
       !useMaskedInterleavedAccesses(TTI)) {
@@ -7138,7 +7161,7 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.collectInLoopReductions();
       if (CM.selectUserVectorizationFactor(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-        buildVPlansWithVPRecipes(UserVF, UserVF);
+        buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks, HasAliasMask);
         LLVM_DEBUG(printPlans(dbgs()));
         return;
       }
@@ -7167,8 +7190,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.collectInstsToScalarize(VF);
   }
 
-  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
-  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
+  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
+                           DiffChecks, HasAliasMask);
+  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
+                           DiffChecks, HasAliasMask);
 
   LLVM_DEBUG(printPlans(dbgs()));
 }
@@ -7690,7 +7715,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
                              CanonicalIVStartValue, State);
 
   BestVPlan.execute(&State);
-
   // 2.5 Collect reduction resume values.
   auto *ExitVPBB =
       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
@@ -7923,7 +7947,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
   // reduction phis in the scalar loop preheader.
   if (EPI.SCEVSafetyCheck)
     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
-  if (EPI.MemSafetyCheck)
+  if (EPI.MemSafetyCheck && !RTChecks.HasAliasMask)
     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
 
@@ -8179,9 +8203,8 @@ void VPRecipeBuilder::createHeaderMask() {
 
   VPBuilder::InsertPointGuard Guard(Builder);
   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
-  VPValue *BlockMask = nullptr;
   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-  BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+  VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
   BlockMaskCache[Header] = BlockMask;
 }
 
@@ -8720,14 +8743,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   return tryToWiden(Instr, Operands, VPBB);
 }
 
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
-                                                        ElementCount MaxVF) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
+    ElementCount MinVF, ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
+    bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
 
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
-    if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
+    if (auto Plan =
+            tryToBuildVPlanWithVPRecipes(SubRange, RTChecks, HasAliasMask)) {
       // Now optimize the initial VPlan.
       if (!Plan->hasVF(ElementCount::getFixed(1)))
         VPlanTransforms::truncateToMinimalBitwidths(*Plan,
@@ -8760,6 +8785,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 
   VPBuilder Builder(TopRegion->getExitingBasicBlock());
   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
+  // If an alias mask is present, this will be replaced by an increment of the
+  // mask's popcount.
   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
       "index.next");
@@ -8978,8 +9005,8 @@ static void addLiveOutsForFirstOrderRecurrences(
   }
 }
 
-VPlanPtr
-LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
+VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
+    VFRange &Range, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask) {
 
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
 
@@ -9215,7 +9242,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     bool WithoutRuntimeCheck =
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
-                                       WithoutRuntimeCheck);
+                                       WithoutRuntimeCheck, PSE, RTChecks);
+    if (ForControlFlow && !RTChecks.empty())
+      HasAliasMask = true;
   }
   return Plan;
 }
@@ -9699,6 +9728,7 @@ static bool processLoopInVPlanNativePath(
   // Mark the loop as already vectorized to avoid vectorizing again.
   Hints.setAlreadyVectorized();
   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+
   return true;
 }
 
@@ -10030,18 +10060,23 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
 
+  bool AddBranchWeights =
+      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
+  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
+                           AddBranchWeights);
+
   // Plan how to best vectorize.
-  LVP.plan(UserVF, UserIC);
+  LVP.plan(UserVF, UserIC,
+           LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(),
+           Checks.HasAliasMask);
   VectorizationFactor VF = LVP.computeBestVF();
+  if (Checks.HasAliasMask)
+    LoopsAliasMasked++;
   unsigned IC = 1;
 
   if (ORE->allowExtraAnalysis(LV_NAME))
     LVP.emitInvalidCostRemarks(ORE);
 
-  bool AddBranchWeights =
-      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
-  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
-                           AddBranchWeights);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c1b97791331bcf..161b2de2beb7ae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -962,7 +962,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
-  assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 59a084401cc9bf..4fb47714e0d19c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -895,6 +895,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     switch (R->getVPDefID()) {
     case VPRecipeBase::VPDerivedIVSC:
     case VPRecipeBase::VPEVLBasedIVPHISC:
+    case VPRecipeBase::VPAliasLaneMaskSC:
     case VPRecipeBase::VPExpandSCEVSC:
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionEVLSC:
@@ -1270,6 +1271,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // last. The second operand must be a positive constant and <= VF.
     ExtractFromEnd,
     LogicalAnd, // Non-poison propagating logical And.
+    PopCount,
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
@@ -2993,6 +2995,52 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   }
 };
 
+// Given a pointer A that is being stored to, and pointer B that is being
+// read from, both with unknown lengths, create a mask that disables
+// elements which could overlap across a loop iteration. For example, if A
+// is X and B is X + 2 with VF being 4, only the final two elements of the
+// loaded vector can be stored since they don't overlap with the stored
+// vector. %b.vec = load %b ; = [s, t, u, v]
+// [...]
+// store %a, %b.vec ; only u and v can be stored as their addresses don't
+// overlap with %a + (VF - 1)
+class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
+
+public:
+  VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink, unsigned ElementSize)
+      : VPSingleDefRecipe(VPDef::VPAliasLaneMaskSC, {Src, Sink}),
+        ElementSize(ElementSize) {}
+
+  ~VPAliasLaneMaskRecipe() override = default;
+
+  VPAliasLaneMaskRecipe *clone() override {
+    return new VPAliasLaneMaskRecipe(getSourceValue(), getSinkValue(),
+                                     ElementSize);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPAliasLaneMaskSC);
+
+  void execute(VPTransformState &State) override;
+
+  /// Get the VPValue* for the pointer being read from
+  VPValue *getSourceValue() const { return getOperand(0); }
+
+  // Get the size of the element(s) accessed by the pointers
+  unsigned getAccessedElementSize() const { return ElementSize; }
+
+  /// Get the VPValue* for the pointer being stored to
+  VPValue *getSinkValue() const { return getOperand(1); }
+
+private:
+  unsigned ElementSize;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// Recipe to expand a SCEV expression.
 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
   const SCEV *Expr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 41f13cc2d9a978..188dc56388dd9c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -399,6 +399,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::PopCount:
     return true;
   default:
     return false;
@@ -465,6 +466,29 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
   }
+  // Count the number of bits set in each lane and reduce the result to a scalar
+  case VPInstruction::PopCount: {
+    Value *Op = State.get(getOperand(0));
+    auto *VT = Op->getType();
+    Value *Cnt = Op;
+
+    // i1 vectors can just use the add reduction. Bigger elements need a ctpop
+    // first.
+    if (VT->getScalarSizeInBits() > 1)
+      Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {VT}, {Cnt});
+
+    auto *VecVT = cast<VectorType>(VT);
+    // Extend to an i8 since i1 is too small to add with
+    if (VecVT->getElementType()->getScalarSizeInBits() < 8) {
+      Cnt = Builder.CreateCast(
+          Instruction::ZExt, Cnt,
+          VectorType::get(Builder.getInt8Ty(), VecVT->getElementCount()));
+    }
+
+    Cnt = Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Cnt);
+    Cnt = Builder.CreateCast(Instruction::ZExt, Cnt, Builder.getInt64Ty());
+    return Cnt;
+  }
   case VPInstruction::FirstOrderRecurrenceSplice: {
     // Generate code to combine the previous and current values in vector v3.
     //
@@ -682,7 +706,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
-         getOpcode() == VPInstruction::ComputeReductionResult;
+         getOpcode() == VPInstruction::ComputeReductionResult ||
+         getOpcode() == PopCount;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -810,6 +835,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ResumePhi:
     O << "resume-phi";
     break;
+  case VPInstruction::PopCount:
+    O << "popcount";
+    break;
   case VPInstruction::ExplicitVectorLength:
     O << "EXPLICIT-VECTOR-LENGTH";
     break;
@@ -3135,6 +3163,40 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
+  IRBuilderBase Builder = State.Builder;
+  Value *SinkValue = State.get(getSinkValue(), true);
+  Value *SourceValue = State.get(getSourceValue(), true);
+
+  Value *Diff = Builder.CreateSub(SourceValue, SinkValue, "sub.diff");
+  auto *Type = Diff->getType();
+  Value *MemEltSize = ConstantInt::get(Type, ElementSize);
+  Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
+  // If the difference is negative then some elements may alias
+  Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLE, DiffDiv,
+                                  ConstantInt::get(Type, 0), "neg.compare");
+  // Splat the compare result then OR it with a lane mask
+  Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
+  Value *DiffMask = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask,
+      {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
+      {ConstantInt::get(Type, 0), DiffDiv}, nullptr, "ptr.diff.lane.mask");
+  Value *Or = Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat);
+  State.set(this, Or, /*IsScalar=*/false);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  getVPSingleValue()->printAsOperand(O, SlotTracker);
+  O << " = alias lane mask ";
+  getSourceValue()->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getSinkValue()->printAsOperand(O, SlotTracker);
+}
+#endif
+
 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
   assert(!State.Lane && "cannot be used in per-lane");
   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d50f3c0c3f3e04..f1b01e32c62470 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1201,8 +1201,9 @@ void VPlanTransforms::optimize(VPlan &Plan) {
 //   %Negated = Not %ALM
 //   branch-on-cond %Negated
 //
-static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
-    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+static VPValue *addVPLaneMaskPhiAndUpdateExitBranch(
+    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck,
+    PredicatedScalarEvolution &PSE, ArrayRef<PointerDiffInfo> RTChecks) {
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
@@ -1212,14 +1213,38 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
   // TODO: Check if dropping the flags is needed if
   // !DataAndControlFlowWithoutRuntimeCheck.
+  VPValue *IncVal = CanonicalIVIncrement->getOperand(1);
+  assert(IncVal != CanonicalIVPHI && "Unexpected operand order");
+
   CanonicalIVIncrement->dropPoisonGeneratingFlags();
   DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+
   // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
   // we have to take unrolling into account. Each part needs to start at
   //   Part * VF
   auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
   VPBuilder Builder(VecPreheader);
 
+  // Create an alias mask for each possibly-aliasing pointer pair. If there
+  // are multiple they are combined together with ANDs.
+  VPValue *AliasMask = nullptr;
+
+  for (auto C : RTChecks) {
+    // FIXME: How to pass this info back?
+    //    HasAliasMask = true;
+    VPValue *Sink =
+        vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SinkStart, *PSE.getSE());
+    VPValue *Src =
+        vputils::getOrCreateVPValueForSCEVExpr(Plan, C.SrcStart, *PSE.getSE());
+    VPAliasLaneMaskRecipe *M =
+        new VPAliasLaneMaskRecipe(Src, Sink, C.AccessSize);
+    VecPreheader->appendRecipe(M);
+    if (AliasMask)
+      AliasMask = Builder.createAnd(AliasMask, M);
+    else
+      AliasMask = M;
+  }
+
   // Create the ActiveLaneMask instruction using the correct start values.
   VPValue *TC = Plan.getTripCount();
 
@@ -1243,14 +1268,37 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
+  VPValue *Mask =
       Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                            DL, "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
-  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(Mask, DebugLoc());
   LaneMaskPhi->insertAfter(CanonicalIVPHI);
+  VPValue *LaneMask = LaneMaskPhi;
+  if (AliasMask) {
+    // Increment phi by correct amount.
+    Builder.setInsertPoint(CanonicalIVIncrement);
+
+    VPValue *IncrementBy = Builder.createNaryOp(VPInstruction::PopCount,
+                                                {AliasMask}, DL, "popcount");
+    Type *IVType = CanonicalIVPHI->getScalarType();
+
+    if (IVType->getScalarSizeInBits() < 64) {
+      auto *Cast =
+          new VPScalarCastRecipe(Instruction::Trunc, IncrementBy, IVType);
+      Cast->insertAfter(IncrementBy->getDefiningRecipe());
+      IncrementBy = Cast;
+    }
+    CanonicalIVIncrement->setOperand(1, IncrementBy);
+
+    // And the alias mask so the iteration only processes non-aliasing lanes
+    Builder.setInsertPoint(CanonicalIVPHI->getParent(),
+                           CanonicalIVPHI->getParent()->getFirstNonPhi());
+    LaneMask = Builder.createNaryOp(Instruction::BinaryOps::And,
+                                    {LaneMaskPhi, AliasMask}, DL);
+  }
 
   // Create the active lane mask for the next iteration of the loop before the
   // original terminator.
@@ -1269,7 +1317,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   auto *NotMask = Builder.createNot(ALM, DL);
   Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
   OriginalTerminator->eraseFromParent();
-  return LaneMaskPhi;
+  return LaneMask;
 }
 
 /// Collect all VPValues representing a header mask through the (ICMP_ULE,
@@ -1319,7 +1367,9 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
 
 void VPlanTransforms::addActiveLaneMask(
     VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
-    bool DataAndControlFlowWithoutRuntimeCheck) {
+    bool DataAndControlFlowWithoutRuntimeCheck, PredicatedScalarEvolution &PSE,
+    ArrayRef<PointerDiffInfo> RTChecks) {
+
   assert((!DataAndControlFlowWithoutRuntimeCheck ||
           UseActiveLaneMaskForControlFlow) &&
          "DataAndControlFlowWithoutRuntimeCheck implies "
@@ -1328,14 +1378,14 @@ void VPlanTransforms::addActiveLaneMask(
   auto *FoundWidenCanonicalIVUser =
       find_if(Plan.getCanonicalIV()->users(),
               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
-  assert(FoundWidenCanonicalIVUser &&
+  assert(FoundWidenCanonicalIVUser && *FoundWidenCanonicalIVUser &&
          "Must have widened canonical IV when tail folding!");
   auto *WideCanonicalIV =
       cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
-  VPSingleDefRecipe *LaneMask;
+  VPValue *LaneMask;
   if (UseActiveLaneMaskForControlFlow) {
     LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
-        Plan, DataAndControlFlowWithoutRuntimeCheck);
+        Plan, DataAndControlFlowWithoutRuntimeCheck, PSE, RTChecks);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
     LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 60a44bfb0dca6b..1eb975c73e495e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -77,9 +77,13 @@ struct VPlanTransforms {
   /// creation) and instead it is handled using active-lane-mask. \p
   /// DataAndControlFlowWithoutRuntimeCheck implies \p
   /// UseActiveLaneMaskForControlFlow.
+  /// RTChecks refers to the pointer pairs that need aliasing elements to be
+  /// masked off each loop iteration.
   static void addActiveLaneMask(VPlan &Plan,
                                 bool UseActiveLaneMaskForControlFlow,
-                                bool DataAndControlFlowWithoutRuntimeCheck);
+                                bool DataAndControlFlowWithoutRuntimeCheck,
+                                PredicatedScalarEvolution &PSE,
+                                ArrayRef<PointerDiffInfo> RTChecks);
 
   /// Insert truncates and extends for any truncated recipe. Redundant casts
   /// will be folded later.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index f2978b0a758b6a..e6265f96f4ada6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -336,6 +336,7 @@ class VPDef {
   using VPRecipeTy = enum {
     VPBranchOnMaskSC,
     VPDerivedIVSC,
+    VPAliasLaneMaskSC,
     VPExpandSCEVSC,
     VPIRInstructionSC,
     VPInstructionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
new file mode 100644
index 00000000000000..c4aafa97a334ca
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+
+define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: define dso_local void @alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B4:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[C1]], [[B2]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP5]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B4]], [[C3]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp sle i64 [[DIFF]], 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT:    [[TMP8:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i64 [[WIDE_TRIP_COUNT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP8]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <vscale x 16 x i1> [[TMP8]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP23]])
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i8 [[TMP24]] to i64
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP25]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
+; CHECK-NEXT:    [[TMP26:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i32 0
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP29]], [[TMP28]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %add = add i8 %1, %0
+  %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+  store i8 %add, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 553989544c7787..fd993a1990224c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -139,14 +139,16 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:  entry:
 ; PRED-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; PRED-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT:    [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
 ; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; PRED:       vector.memcheck:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
-; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; PRED-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED-NEXT:    br label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
@@ -156,6 +158,13 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; PRED-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[SRC2]], [[DST1]]
+; PRED-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; PRED-NEXT:    [[NEG_COMPARE:%.*]] = icmp sle i64 [[DIFF]], 0
+; PRED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; PRED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -170,9 +179,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[TMP30:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
 ; PRED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
 ; PRED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
-; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP30]], <vscale x 8 x i8> poison)
 ; PRED-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
 ; PRED-NEXT:    [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
 ; PRED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
@@ -181,8 +191,11 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
 ; PRED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[TMP30]])
+; PRED-NEXT:    [[TMP31:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
+; PRED-NEXT:    [[TMP32:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP31]])
+; PRED-NEXT:    [[TMP33:%.*]] = zext i8 [[TMP32]] to i64
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP33]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
 ; PRED-NEXT:    [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; PRED-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
@@ -190,7 +203,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED:       middle.block:
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; PRED-NEXT:    br label [[LOOP:%.*]]
 ; PRED:       loop:
 ; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index e7e63e55802fe1..53ef470e098d0d 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -185,18 +185,18 @@ define void @load_clamped_index_offset_1(ptr %A, ptr %B, i32 %N) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = urem i32 [[TMP10]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD]], <i32 10, i32 10, i32 10, i32 10>
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = urem i32 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[WIDE_LOAD]], <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index ecdc4ed416d47e..10e43dd97200b2 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -77,11 +77,11 @@ define void @different_steps_and_different_access_sizes(ptr %a, ptr %b, i64 %n)
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT:    [[N_SHL_1:%.]] = shl i64 %n, 1
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr %a, i64 [[N_SHL_1]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 %n, 1
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr %a, i64 [[TMP1]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP1]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr %a, [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
@@ -177,21 +177,22 @@ exit:
 define void @nested_loop_outer_iv_addrec_invariant_in_inner1(ptr %a, ptr %b, i64 %n) {
 ; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner1(
 ; CHECK:        entry:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 %n, 2
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr %b, i64 [[TMP0]]
 ; CHECK-NEXT:    br label %outer
 
 ; CHECK:       outer.header:
-; CHECK:         [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2
-; CHECK-NEXT:    [[A_GEP_UPPER:%.*]] = getelementptr nuw i8, ptr %a, i64 [[OUTER_IV_SHL_2]]
-; CHECK-NEXT:    [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4
-; CHECK-NEXT:    [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]]
-; CHECK:         [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
-
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr nuw i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
 ;
@@ -226,22 +227,23 @@ exit:
 ; sink and source swapped.
 define void @nested_loop_outer_iv_addrec_invariant_in_inner2(ptr %a, ptr %b, i64 %n) {
 ; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner2(
-; CHECK:        entry:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT:    br label %outer
-
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N:%.*]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
 ; CHECK:       outer.header:
-; CHECK:         [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2
-; CHECK-NEXT:    [[A_GEP_UPPER:%.*]] = getelementptr nuw i8, ptr %a, i64 [[OUTER_IV_SHL_2]]
-; CHECK-NEXT:    [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4
-; CHECK-NEXT:    [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]]
-; CHECK:         [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr nuw i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
 ;
@@ -280,7 +282,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
 ; CHECK-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
-; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
 ; CHECK:       outer.loop:
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
@@ -288,7 +290,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[SUB]], 16
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
 entry: