[llvm] [LV] Add support for partial alias masking with tail folding (PR #182457)

via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 20 00:59:48 PST 2026


llvmbot wrote:


@llvm/pr-subscribers-llvm-transforms

Author: Benjamin Maxwell (MacDue)

This patch adds basic support for partial alias masking, which allows entering the vector loop even when accesses alias within a single vector iteration. It does this by clamping the VF to the safe distance between the aliasing pointers, allowing the runtime VF to be anywhere from 2 to the "static" VF.

Conceptually, this transform looks like:

```
  // `c` and `b` may alias.
  for (int i = 0; i < n; i++) {
    c[i] = a[i] + b[i];
  }
```

->

```
  svbool_t alias_mask = loop.dependence.war.mask(b, c);
  int num_active = num_active_lanes(alias_mask);
  if (num_active >= 2) {
    for (int i = 0; i < n; i += num_active) {
      // ... vector loop masked with `alias_mask`
    }
  }
  // ... scalar tail
```
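
For intuition, the WAR mask can be modeled in scalar code. The sketch below paraphrases the documented semantics of `llvm.loop.dependence.war.mask` for a fixed-width mask; `VF`, the pointer values, and the element size are illustrative and not taken from the patch:

```c++
#include <cstdint>
#include <cstdio>

constexpr int VF = 8; // illustrative "static" VF

// Scalar model of `loop.dependence.war.mask(b, c)`: with diff = c - b in
// bytes, lane i is active when either the write never lands ahead of the
// read (diff <= 0) or the overlap starts beyond lane i (i * eltSize < diff).
static unsigned warMask(intptr_t b, intptr_t c, intptr_t eltSize) {
  intptr_t diff = c - b;
  unsigned mask = 0;
  for (int i = 0; i < VF; ++i)
    if (diff <= 0 || i * eltSize < diff)
      mask |= 1u << i;
  return mask;
}

int main() {
  // c == b + 3 elements of 4 bytes: only the first 3 lanes are active, so
  // num_active == 3 and the loop above advances 3 elements per iteration,
  // i.e. a runtime VF of 3 instead of the static VF of 8.
  printf("mask = %#x\n", warMask(/*b=*/0, /*c=*/12, /*eltSize=*/4)); // 0x7
}
```

If the pointers do not overlap within a vector (diff <= 0 or diff >= VF * eltSize), the mask is all-true and the loop runs at the full static VF.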

This initial patch has a number of limitations:

- The loop must be tail-folded
  * We intend to follow up with full alias-masking support for loops without tail-folding
- The mask and transform are only valid for IC = 1
  * Some recipes may not handle the "ClampedVF" correctly at IC > 1
  * Note: On AArch64, we also only have native alias mask instructions for IC = 1
- Reverse iteration is not supported
  * The mask reversal logic is not correct for the alias mask (or the clamped active lane mask (ALM))
- First order recurrences are not supported
  * The `splice.right` is not lowered correctly for clamped VFs
- This style of vectorization is not enabled by default and is not yet costed
  * It can be enabled with `-force-partial-aliasing-vectorization`
  * When enabled, alias masking is used instead of the standard diff checks (when legal to do so)

This PR supersedes #100579 (closes #100579).

---

Patch is 80.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182457.diff


17 Files Affected:

- (modified) llvm/lib/Analysis/VectorUtils.cpp (+2) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h (+4) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+108-6) 
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+3-2) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+2) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp (+8-2) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+20-1) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+67) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+8) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.cpp (+17-1) 
- (added) llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll (+470) 
- (added) llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing-alias-mask.ll (+96) 
- (added) llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-alias-mask.ll (+117) 
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll (+6-6) 
- (added) llvm/test/Transforms/LoopVectorize/alias-mask.ll (+125) 
- (modified) llvm/test/Transforms/LoopVectorize/pointer-induction.ll (+3-3) 
- (modified) llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll (+6-6) 


``````````diff
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index d4083c49626fe..e3cf650ddb76b 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -170,6 +170,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
     return (ScalarOpdIdx == 2);
   case Intrinsic::experimental_vp_splice:
     return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
+  case Intrinsic::loop_dependence_war_mask:
+    return true;
   default:
     return false;
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 54bb073eb4f81..1019849b1d011 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -668,6 +668,10 @@ class LoopVectorizationPlanner {
   void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
                            bool HasBranchWeights) const;
 
+  VPValue *materializeAliasMask(VPlan &Plan,
+                                ArrayRef<PointerDiffInfo> DiffChecks,
+                                bool HasBranchWeights);
+
 #ifndef NDEBUG
   /// \return The most profitable vectorization factor for the available VPlans
   /// and the cost of that VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6299e8c2dbd32..5bf474a89157b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -170,6 +170,8 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
 STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
+STATISTIC(LoopsPartialAliasVectorized,
+          "Number of partial aliasing loops vectorized");
 
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -198,6 +200,10 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
     cl::desc("The maximum allowed number of runtime memory checks"));
 
+static cl::opt<bool> ForcePartialAliasingVectorization(
+    "force-partial-aliasing-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Replace pointer diff checks with alias masks."));
+
 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
 // that predication is preferred, and this lists all options. I.e., the
 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -1386,6 +1392,42 @@ class LoopVectorizationCostModel {
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
+  void checkIfPartialAliasMaskingIsEnabled() {
+    assert(foldTailByMasking() && "Expected tail folding to be enabled!");
+    assert(!IsPartialAliasMaskingEnabled &&
+           "Partial alias masking already checked!");
+    if (!ForcePartialAliasingVectorization ||
+        !Legal->getFixedOrderRecurrences().empty()) {
+      // Note: FixedOrderRecurrences are not supported yet as we cannot handle
+      // the required `splice.right` with the alias-mask.
+      IsPartialAliasMaskingEnabled = false;
+      return;
+    }
+    const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking();
+    if (!Checks) {
+      // Runtime checks not needed for this loop (no alias mask required).
+      IsPartialAliasMaskingEnabled = false;
+      return;
+    }
+    if (auto DiffChecks = Checks->getDiffChecks()) {
+      // We have diff checks. We can use an alias mask.
+      IsPartialAliasMaskingEnabled = !DiffChecks->empty();
+      return;
+    }
+    // Runtime checks are not diff checks (can't be replaced with alias mask).
+    IsPartialAliasMaskingEnabled = false;
+  }
+
+  void disablePartialAliasMaskingIfEnabled() {
+    if (IsPartialAliasMaskingEnabled)
+      IsPartialAliasMaskingEnabled = false;
+  }
+
+  /// Returns true if all loop blocks should have partial aliases masked.
+  bool maskPartialAliasing() const {
+    return IsPartialAliasMaskingEnabled.value_or(false);
+  }
+
   /// Returns true if the use of wide lane masks is requested and the loop is
   /// using tail-folding with a lane mask for control flow.
   bool useWideActiveLaneMask() const {
@@ -1604,6 +1646,9 @@ class LoopVectorizationCostModel {
   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
       ChosenTailFoldingStyle;
 
+  /// true if partial alias masking is enabled (nullopt = undecided).
+  std::optional<bool> IsPartialAliasMaskingEnabled;
+
   /// true if scalable vectorization is supported and enabled.
   std::optional<bool> IsScalableVectorizationAllowed;
 
@@ -1825,14 +1870,18 @@ class GeneratedRTChecks {
   /// The kind of cost that we are calculating
   TTI::TargetCostKind CostKind;
 
+  /// True if the loop is alias-masked (which allows us to omit diff checks).
+  bool LoopUsesAliasMasking = false;
+
 public:
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
-                    TTI::TargetCostKind CostKind)
+                    TTI::TargetCostKind CostKind, bool LoopUsesAliasMasking)
       : DT(DT), LI(LI), TTI(TTI),
         SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
         MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
-        PSE(PSE), CostKind(CostKind) {}
+        PSE(PSE), CostKind(CostKind),
+        LoopUsesAliasMasking(LoopUsesAliasMasking) {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -1885,7 +1934,7 @@ class GeneratedRTChecks {
     }
 
     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
-    if (RtPtrChecking.Need) {
+    if (RtPtrChecking.Need && !LoopUsesAliasMasking) {
       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                  "vector.memcheck");
@@ -3088,10 +3137,17 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   auto *Ptr = getLoadStorePointerOperand(I);
   auto *ScalarTy = getLoadStoreType(I);
 
+  int Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
   // In order to be widened, the pointer should be consecutive, first of all.
-  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
+  if (!Stride)
     return false;
 
+  // Currently, we can't handle alias masking in reverse. Reversing the alias
+  // mask is not correct (or necessary). When combined with tail-folding, the
+  // ALM should only be reversed where the alias mask is true.
+  if (Stride < 0)
+    disablePartialAliasMaskingIfEnabled();
+
   // If the instruction is a store located in a predicated block, it will be
   // scalarized.
   if (isScalarWithPredication(I, VF))
@@ -3747,6 +3803,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
       assert(ContainsScalableVF && "Expected scalable vector factor.");
 
       MaxFactors.FixedVF = ElementCount::getFixed(1);
+    } else {
+      checkIfPartialAliasMaskingIsEnabled();
     }
     return MaxFactors;
   }
@@ -4465,6 +4523,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     return Result;
   }
 
+  if (CM.maskPartialAliasing()) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LEV: Epilogue vectorization not supported with alias masking");
+    return Result;
+  }
+
   // Not really a cost consideration, but check for unsupported cases here to
   // simplify the logic.
   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
@@ -7445,6 +7510,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // compactness.
   attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
 
+  VPValue *ClampedVF = nullptr;
+  if (CM.maskPartialAliasing()) {
+    ClampedVF = materializeAliasMask(
+        BestVPlan, *CM.Legal->getRuntimePointerChecking()->getDiffChecks(),
+        HasBranchWeights);
+    ++LoopsPartialAliasVectorized;
+  }
+
   // Retrieving VectorPH now when it's easier while VPlan still has Regions.
   VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
 
@@ -7481,6 +7554,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   VPlanTransforms::materializeVectorTripCount(
       BestVPlan, VectorPH, CM.foldTailByMasking(),
       CM.requiresScalarEpilogue(BestVF.isVector()));
+  VPlanTransforms::fixupVFUsersForClampedVF(BestVPlan, ClampedVF);
   VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
   VPlanTransforms::cse(BestVPlan);
   VPlanTransforms::simplifyRecipes(BestVPlan);
@@ -8694,6 +8768,21 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
   }
 }
 
+VPValue *LoopVectorizationPlanner::materializeAliasMask(
+    VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
+  VPBasicBlock *MinVFCheck = Plan.createVPBasicBlock("vector.min.vf.check");
+  VPValue *ClampedVF = VPlanTransforms::materializeAliasMask(
+      Plan, MinVFCheck,
+      *CM.Legal->getRuntimePointerChecking()->getDiffChecks());
+  VPBuilder Builder(MinVFCheck);
+  Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+  // Check that the "ClampedVF" from the alias mask has at least two elements.
+  VPValue *Cond = Builder.createICmp(
+      CmpInst::ICMP_ULT, ClampedVF, Plan.getConstantInt(IVTy, 2), {}, "cmp.vf");
+  VPlanTransforms::attachCheckBlock(Plan, Cond, MinVFCheck, HasBranchWeights);
+  return ClampedVF;
+}
+
 void LoopVectorizationPlanner::addMinimumIterationCheck(
     VPlan &Plan, ElementCount VF, unsigned UF,
     ElementCount MinProfitableTripCount) const {
@@ -8806,7 +8895,8 @@ static bool processLoopInVPlanNativePath(
   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
 
   {
-    GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
+    GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
+                             CM.maskPartialAliasing());
     InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
                            Checks, BestPlan);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -9677,7 +9767,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   if (ORE->allowExtraAnalysis(LV_NAME))
     LVP.emitInvalidCostRemarks(ORE);
 
-  GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
+  GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
+                           CM.maskPartialAliasing());
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
@@ -9796,6 +9887,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     IC = 1;
   }
 
+  if (CM.maskPartialAliasing()) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Not interleaving due to partial aliasing vectorization.\n");
+    IntDiagMsg = {
+        "PartialAliasingVectorization",
+        "Unable to interleave due to partial aliasing vectorization."};
+    InterleaveLoop = false;
+    IC = 1;
+  }
+
   // Emit diagnostic messages, if any.
   const char *VAPassName = Hints.vectorizeAnalysisPassName();
   if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a0c23df0b3c38..bd61fd4c92310 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1209,8 +1209,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     // part if it is scalar. In the latter case, the recipe will be removed
     // during unrolling.
     ExtractPenultimateElement,
-    LogicalAnd, // Non-poison propagating logical And.
-    LogicalOr,  // Non-poison propagating logical Or.
+    LogicalAnd,     // Non-poison propagating logical And.
+    LogicalOr,      // Non-poison propagating logical Or.
+    NumActiveLanes, // Counts the number of active lanes in a mask.
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 4b744b9128171..d552e1cb2c38c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -147,6 +147,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
     return inferScalarType(R->getOperand(0));
   case Instruction::ExtractValue:
     return cast<ExtractValueInst>(R->getUnderlyingValue())->getType();
+  case VPInstruction::NumActiveLanes:
+    return Type::getInt64Ty(Ctx);
   default:
     break;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 1af7392b904da..683c4c9bad465 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1030,13 +1030,19 @@ static void addBypassBranch(VPlan &Plan, VPBasicBlock *CheckBlockVPBB,
   }
 }
 
+void VPlanTransforms::attachCheckBlock(VPlan &Plan, VPValue *Cond,
+                                       VPBasicBlock *CheckBlock,
+                                       bool AddBranchWeights) {
+  insertCheckBlockBeforeVectorLoop(Plan, CheckBlock);
+  addBypassBranch(Plan, CheckBlock, Cond, AddBranchWeights);
+}
+
 void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
                                        BasicBlock *CheckBlock,
                                        bool AddBranchWeights) {
   VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
   VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
-  insertCheckBlockBeforeVectorLoop(Plan, CheckBlockVPBB);
-  addBypassBranch(Plan, CheckBlockVPBB, CondVPV, AddBranchWeights);
+  attachCheckBlock(Plan, CondVPV, CheckBlockVPBB, AddBranchWeights);
 }
 
 void VPlanTransforms::addMinimumIterationCheck(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 33cb1509565d5..e9cc6d27381f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -461,6 +461,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
   case VPInstruction::ResumeForEpilogue:
   case VPInstruction::Reverse:
   case VPInstruction::Unpack:
+  case VPInstruction::NumActiveLanes:
     return 1;
   case Instruction::ICmp:
   case Instruction::FCmp:
@@ -609,6 +610,20 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
   }
+  case VPInstruction::NumActiveLanes: {
+    Value *Op = State.get(getOperand(0));
+    auto *VecTy = cast<VectorType>(Op->getType());
+    assert(VecTy->getScalarSizeInBits() == 1 &&
+           "NumActiveLanes only implemented for i1 vectors");
+
+    Value *ZExt = Builder.CreateCast(
+        Instruction::ZExt, Op,
+        VectorType::get(Builder.getInt32Ty(), VecTy->getElementCount()));
+    Value *Count =
+        Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt);
+    return Builder.CreateCast(Instruction::ZExt, Count, Builder.getInt64Ty(),
+                              "num.active.lanes");
+  }
   case VPInstruction::FirstOrderRecurrenceSplice: {
     // Generate code to combine the previous and current values in vector v3.
     //
@@ -1271,7 +1286,8 @@ bool VPInstruction::isVectorToScalar() const {
          getOpcode() == VPInstruction::ComputeAnyOfResult ||
          getOpcode() == VPInstruction::ExtractLastActive ||
          getOpcode() == VPInstruction::ComputeReductionResult ||
-         getOpcode() == VPInstruction::AnyOf;
+         getOpcode() == VPInstruction::AnyOf ||
+         getOpcode() == VPInstruction::NumActiveLanes;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -1545,6 +1561,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ExtractLastActive:
     O << "extract-last-active";
     break;
+  case VPInstruction::NumActiveLanes:
+    O << "num-active-lanes";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 22a8edaf30eb6..ee301712b6fcb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5112,6 +5112,73 @@ void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
          "VF, UF, and VFxUF not expected to be used");
 }
 
+VPValue *
+VPlanTransforms::materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheck,
+                                      ArrayRef<PointerDiffInfo> DiffChecks) {
+
+  VPBuilder Builder(AliasCheck, AliasCheck->begin());
+  Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
+  Type *I64Ty = IntegerType::getInt64Ty(Plan.getContext());
+  Type *PtrTy = PointerType::getUnqual(Plan.getContext());
+
+  VPValue *AliasMask = nullptr;
+  for (PointerDiffInfo Check : DiffChecks) {
+    VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart);
+    VPValue *Sink =
+        vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart);
+
+    VPValue *SrcPtr =
+        Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src, PtrTy,
+                                 DebugLoc::getCompilerGenerated());
+    VPValue *SinkPtr =
+        Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink, PtrTy,
+                                 DebugLoc::getCompilerGenerated());
+
+    VPWidenIntrinsicRecipe *WARMask = new VPWidenIntrinsicRecipe(
+        Intrinsic::loop_dependence_war_mask,
+        {SrcPtr, SinkPtr, Plan.getConstantInt(I64Ty, Check.AccessSize)}, I1Ty);
+    Builder.insert(WARMask);
+
+    if (AliasMask)
+      AliasMask = Builder.createAnd(AliasMask, WARMask);
+    else
+      AliasMask = WARMask;
+  }
+
+  Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+  VPValue *NumActive =
+      Builder.createNaryOp(VPInstruction::NumActiveLanes, {AliasMask});
+  VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
+      NumActive, IVTy, I64Ty, DebugLoc::getCompilerGenerated());
+
+  // Find the existing header mask.
+  VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
+  auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
+  if (HeaderMaskDef->isPhi())
+    Builder.setInsertPoint(&*HeaderMaskDef->getParent()->getFirstNonPhi());
+  else
+    Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
+
+  // Update all existing users of the header mask to "HeaderMask & AliasMask".
+  auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
+  HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
+    return dyn_cast<VPInstruction>(&U) != ClampedHeaderMask;
+  });
+
+  return ClampedVF;
+}
+
+void VPlanTransforms::fixupVFUsersForClampedVF(VPlan &Plan,
+                                               VPValue *ClampedVF) {
+  if (!ClampedVF)
+    return;
+
+  assert(Plan.getConcreteUF() == 1 &&
+         "Clamped VF not support with interleaving");
+  Plan.getVF().replaceAllUsesWith(ClampedVF);
+  Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
+}
+
 DenseMap<const SCEV *, Value *>
 VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
   SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f2dfc166cecc9..292a97b61817c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -180,6 +180,8 @@ struct VPlanTransforms {
   /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
   /// VPValue and connect the block to \p Plan, using the VPValue as branch
   /// condition.
+  static void attachCheckBlock(VPlan &Plan, VPValue *Cond,
+                  ...
[truncated]

``````````
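
One detail worth calling out from the diff: the new `NumActiveLanes` VPInstruction lowers to a zero-extend of the i1 mask followed by `vector.reduce.add`, i.e. a population count of the active lanes. A scalar model of that lowering (a sketch; a plain array stands in for the vector mask):

```c++
#include <cstdint>

// Scalar model of the VPInstruction::NumActiveLanes lowering in the diff:
// zext each i1 lane to i32, vector.reduce.add the lanes, then zext the sum
// to i64.
static uint64_t numActiveLanes(const bool *mask, unsigned vf) {
  uint32_t sum = 0;             // accumulator for vector.reduce.add
  for (unsigned i = 0; i < vf; ++i)
    sum += mask[i] ? 1u : 0u;   // zext i1 -> i32 per lane
  return sum;                   // zext i32 -> i64
}
```

The result becomes the `ClampedVF` that `fixupVFUsersForClampedVF` substitutes for both `VF` and `VFxUF`, which is also why the transform asserts a concrete UF of 1.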



https://github.com/llvm/llvm-project/pull/182457

