[llvm] [LV] Add support for partial alias masking with tail folding (PR #182457)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 20 00:59:48 PST 2026
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Benjamin Maxwell (MacDue)
Changes:
This patch adds basic support for partial alias masking, which allows entering the vector loop even when there is aliasing within a single vector iteration. It does this by clamping the VF to the safe distance between pointers. This allows the runtime VF to be anywhere from 2 to the "static" VF.
Conceptually, this transform looks like:
```
// `c` and `b` may alias.
for (int i = 0; i < n; i++) {
  c[i] = a[i] + b[i];
}
```
->
```
svbool_t alias_mask = loop.dependence.war.mask(b, c);
int num_active = num_active_lanes(alias_mask);
if (num_active >= 2) {
  for (int i = 0; i < n; i += num_active) {
    // ... vector loop masked with `alias_mask`
  }
}
// ... scalar tail
```
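In IR terms, the materialized check boils down to something like the sketch below (value and block names are invented for illustration, and the `<vscale x 4 x ...>` shape assumes i32 elements with a scalable VF). It mirrors the `NumActiveLanes` lowering in the patch: zero-extend the i1 alias mask, reduce-add it to count the safe lanes, and bypass the vector loop when fewer than two lanes are safe:
```llvm
; vector.min.vf.check (sketch):
  %alias.mask = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr %b, ptr %c, i64 4)
  %mask.zext = zext <vscale x 4 x i1> %alias.mask to <vscale x 4 x i32>
  %lanes = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %mask.zext)
  %num.active.lanes = zext i32 %lanes to i64
  %cmp.vf = icmp ult i64 %num.active.lanes, 2
  br i1 %cmp.vf, label %scalar.ph, label %vector.ph
```
On AArch64 with SVE2, `llvm.loop.dependence.war.mask` can lower directly to a `whilewr` instruction.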
This initial patch has a number of limitations:
- The loop must be tail-folded
* We intend to follow up with full alias-masking support for loops without tail-folding
- The mask and transform are only valid for IC = 1
* Some recipes may not handle the "ClampedVF" correctly at IC > 1
* Note: On AArch64, we also only have native alias mask instructions for IC = 1
- Reverse iteration is not supported
* The mask reversal logic is not correct for the alias mask (or the clamped active lane mask)
- First order recurrences are not supported
* The `splice.right` is not lowered correctly for clamped VFs
- This style of vectorization is not enabled by default and is not yet costed
* It can be enabled with `-force-partial-aliasing-vectorization`
* When enabled, alias masking is used instead of the standard diff checks (when legal to do so)
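To illustrate the effect on the loop itself, here is a sketch of the tail-folded vector body once the alias mask is applied (names invented; assumes tail folding via `get.active.lane.mask`): every former use of the header mask becomes an AND with the alias mask, and the induction variable advances by the clamped VF instead of the static VFxUF:
```llvm
vector.body:                                      ; sketch only
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %n)
  ; All former header-mask users now use the combined mask.
  %clamped.mask = and <vscale x 4 x i1> %active.lane.mask, %alias.mask
  ; ... loads/stores predicated on %clamped.mask ...
  ; The IV steps by the clamped VF (num.active.lanes), not VF x UF.
  %index.next = add i64 %index, %num.active.lanes
  ; ... tail-folded exit test elided ...
```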
This PR supersedes #100579 (closes #100579).
---
Patch is 80.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182457.diff
17 Files Affected:
- (modified) llvm/lib/Analysis/VectorUtils.cpp (+2)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h (+4)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+108-6)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+3-2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp (+8-2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+20-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+67)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+8)
- (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.cpp (+17-1)
- (added) llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll (+470)
- (added) llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing-alias-mask.ll (+96)
- (added) llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-alias-mask.ll (+117)
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll (+6-6)
- (added) llvm/test/Transforms/LoopVectorize/alias-mask.ll (+125)
- (modified) llvm/test/Transforms/LoopVectorize/pointer-induction.ll (+3-3)
- (modified) llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll (+6-6)
``````````diff
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index d4083c49626fe..e3cf650ddb76b 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -170,6 +170,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
+ case Intrinsic::loop_dependence_war_mask:
+ return true;
default:
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 54bb073eb4f81..1019849b1d011 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -668,6 +668,10 @@ class LoopVectorizationPlanner {
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
bool HasBranchWeights) const;
+ VPValue *materializeAliasMask(VPlan &Plan,
+ ArrayRef<PointerDiffInfo> DiffChecks,
+ bool HasBranchWeights);
+
#ifndef NDEBUG
/// \return The most profitable vectorization factor for the available VPlans
/// and the cost of that VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6299e8c2dbd32..5bf474a89157b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -170,6 +170,8 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
+STATISTIC(LoopsPartialAliasVectorized,
+ "Number of partial aliasing loops vectorized");
static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -198,6 +200,10 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));
+static cl::opt<bool> ForcePartialAliasingVectorization(
+ "force-partial-aliasing-vectorization", cl::init(false), cl::Hidden,
+ cl::desc("Replace pointer diff checks with alias masks."));
+
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -1386,6 +1392,42 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}
+ void checkIfPartialAliasMaskingIsEnabled() {
+ assert(foldTailByMasking() && "Expected tail folding to be enabled!");
+ assert(!IsPartialAliasMaskingEnabled &&
+ "Partial alias masking already checked!");
+ if (!ForcePartialAliasingVectorization ||
+ !Legal->getFixedOrderRecurrences().empty()) {
+ // Note: FixedOrderRecurrences are not supported yet as we cannot handle
+ // the required `splice.right` with the alias-mask.
+ IsPartialAliasMaskingEnabled = false;
+ return;
+ }
+ const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking();
+ if (!Checks) {
+ // Runtime checks not needed for this loop (no alias mask required).
+ IsPartialAliasMaskingEnabled = false;
+ return;
+ }
+ if (auto DiffChecks = Checks->getDiffChecks()) {
+ // We have diff checks. We can use an alias mask.
+ IsPartialAliasMaskingEnabled = !DiffChecks->empty();
+ return;
+ }
+ // Runtime checks are not diff checks (can't be replaced with alias mask).
+ IsPartialAliasMaskingEnabled = false;
+ }
+
+ void disablePartialAliasMaskingIfEnabled() {
+ if (IsPartialAliasMaskingEnabled)
+ IsPartialAliasMaskingEnabled = false;
+ }
+
+ /// Returns true if all loop blocks should have partial aliases masked.
+ bool maskPartialAliasing() const {
+ return IsPartialAliasMaskingEnabled.value_or(false);
+ }
+
/// Returns true if the use of wide lane masks is requested and the loop is
/// using tail-folding with a lane mask for control flow.
bool useWideActiveLaneMask() const {
@@ -1604,6 +1646,9 @@ class LoopVectorizationCostModel {
std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
ChosenTailFoldingStyle;
+ /// true if partial alias masking is enabled (nullopt = undecided).
+ std::optional<bool> IsPartialAliasMaskingEnabled;
+
/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;
@@ -1825,14 +1870,18 @@ class GeneratedRTChecks {
/// The kind of cost that we are calculating
TTI::TargetCostKind CostKind;
+ /// True if the loop is alias-masked (which allows us to omit diff checks).
+ bool LoopUsesAliasMasking = false;
+
public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
- TTI::TargetCostKind CostKind)
+ TTI::TargetCostKind CostKind, bool LoopUsesAliasMasking)
: DT(DT), LI(LI), TTI(TTI),
SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
- PSE(PSE), CostKind(CostKind) {}
+ PSE(PSE), CostKind(CostKind),
+ LoopUsesAliasMasking(LoopUsesAliasMasking) {}
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1885,7 +1934,7 @@ class GeneratedRTChecks {
}
const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
- if (RtPtrChecking.Need) {
+ if (RtPtrChecking.Need && !LoopUsesAliasMasking) {
auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
"vector.memcheck");
@@ -3088,10 +3137,17 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
auto *Ptr = getLoadStorePointerOperand(I);
auto *ScalarTy = getLoadStoreType(I);
+ int Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
// In order to be widened, the pointer should be consecutive, first of all.
- if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
+ if (!Stride)
return false;
+ // Currently, we can't handle alias masking in reverse. Reversing the alias
+ // mask is not correct (or necessary). When combined with tail-folding, the ALM
+ // should only be reversed where the alias-mask is true.
+ if (Stride < 0)
+ disablePartialAliasMaskingIfEnabled();
+
// If the instruction is a store located in a predicated block, it will be
// scalarized.
if (isScalarWithPredication(I, VF))
@@ -3747,6 +3803,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
assert(ContainsScalableVF && "Expected scalable vector factor.");
MaxFactors.FixedVF = ElementCount::getFixed(1);
+ } else {
+ checkIfPartialAliasMaskingIsEnabled();
}
return MaxFactors;
}
@@ -4465,6 +4523,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}
+ if (CM.maskPartialAliasing()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LEV: Epilogue vectorization not supported with alias masking");
+ return Result;
+ }
+
// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
@@ -7445,6 +7510,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// compactness.
attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
+ VPValue *ClampedVF = nullptr;
+ if (CM.maskPartialAliasing()) {
+ ClampedVF = materializeAliasMask(
+ BestVPlan, *CM.Legal->getRuntimePointerChecking()->getDiffChecks(),
+ HasBranchWeights);
+ ++LoopsPartialAliasVectorized;
+ }
+
// Retrieving VectorPH now when it's easier while VPlan still has Regions.
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
@@ -7481,6 +7554,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::materializeVectorTripCount(
BestVPlan, VectorPH, CM.foldTailByMasking(),
CM.requiresScalarEpilogue(BestVF.isVector()));
+ VPlanTransforms::fixupVFUsersForClampedVF(BestVPlan, ClampedVF);
VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
VPlanTransforms::cse(BestVPlan);
VPlanTransforms::simplifyRecipes(BestVPlan);
@@ -8694,6 +8768,21 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
}
}
+VPValue *LoopVectorizationPlanner::materializeAliasMask(
+ VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
+ VPBasicBlock *MinVFCheck = Plan.createVPBasicBlock("vector.min.vf.check");
+ VPValue *ClampedVF = VPlanTransforms::materializeAliasMask(
+ Plan, MinVFCheck, DiffChecks);
+ VPBuilder Builder(MinVFCheck);
+ Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+ // Check that the "ClampedVF" from the alias mask is at least two elements.
+ VPValue *Cond = Builder.createICmp(
+ CmpInst::ICMP_ULT, ClampedVF, Plan.getConstantInt(IVTy, 2), {}, "cmp.vf");
+ VPlanTransforms::attachCheckBlock(Plan, Cond, MinVFCheck, HasBranchWeights);
+ return ClampedVF;
+}
+
void LoopVectorizationPlanner::addMinimumIterationCheck(
VPlan &Plan, ElementCount VF, unsigned UF,
ElementCount MinProfitableTripCount) const {
@@ -8806,7 +8895,8 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
{
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
+ CM.maskPartialAliasing());
InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -9677,7 +9767,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
+ CM.maskPartialAliasing());
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
@@ -9796,6 +9887,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
IC = 1;
}
+ if (CM.maskPartialAliasing()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not interleaving due to partial aliasing vectorization.\n");
+ IntDiagMsg = {
+ "PartialAliasingVectorization",
+ "Unable to interleave due to partial aliasing vectorization."};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a0c23df0b3c38..bd61fd4c92310 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1209,8 +1209,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
ExtractPenultimateElement,
- LogicalAnd, // Non-poison propagating logical And.
- LogicalOr, // Non-poison propagating logical Or.
+ LogicalAnd, // Non-poison propagating logical And.
+ LogicalOr, // Non-poison propagating logical Or.
+ NumActiveLanes, // Counts the number of active lanes in a mask.
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 4b744b9128171..d552e1cb2c38c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -147,6 +147,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return inferScalarType(R->getOperand(0));
case Instruction::ExtractValue:
return cast<ExtractValueInst>(R->getUnderlyingValue())->getType();
+ case VPInstruction::NumActiveLanes:
+ return Type::getInt64Ty(Ctx);
default:
break;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 1af7392b904da..683c4c9bad465 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1030,13 +1030,19 @@ static void addBypassBranch(VPlan &Plan, VPBasicBlock *CheckBlockVPBB,
}
}
+void VPlanTransforms::attachCheckBlock(VPlan &Plan, VPValue *Cond,
+ VPBasicBlock *CheckBlock,
+ bool AddBranchWeights) {
+ insertCheckBlockBeforeVectorLoop(Plan, CheckBlock);
+ addBypassBranch(Plan, CheckBlock, Cond, AddBranchWeights);
+}
+
void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
BasicBlock *CheckBlock,
bool AddBranchWeights) {
VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
- insertCheckBlockBeforeVectorLoop(Plan, CheckBlockVPBB);
- addBypassBranch(Plan, CheckBlockVPBB, CondVPV, AddBranchWeights);
+ attachCheckBlock(Plan, CondVPV, CheckBlockVPBB, AddBranchWeights);
}
void VPlanTransforms::addMinimumIterationCheck(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 33cb1509565d5..e9cc6d27381f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -461,6 +461,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
case VPInstruction::ResumeForEpilogue:
case VPInstruction::Reverse:
case VPInstruction::Unpack:
+ case VPInstruction::NumActiveLanes:
return 1;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -609,6 +610,20 @@ Value *VPInstruction::generate(VPTransformState &State) {
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
}
+ case VPInstruction::NumActiveLanes: {
+ Value *Op = State.get(getOperand(0));
+ auto *VecTy = cast<VectorType>(Op->getType());
+ assert(VecTy->getScalarSizeInBits() == 1 &&
+ "NumActiveLanes only implemented for i1 vectors");
+
+ Value *ZExt = Builder.CreateCast(
+ Instruction::ZExt, Op,
+ VectorType::get(Builder.getInt32Ty(), VecTy->getElementCount()));
+ Value *Count =
+ Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt);
+ return Builder.CreateCast(Instruction::ZExt, Count, Builder.getInt64Ty(),
+ "num.active.lanes");
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
//
@@ -1271,7 +1286,8 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
- getOpcode() == VPInstruction::AnyOf;
+ getOpcode() == VPInstruction::AnyOf ||
+ getOpcode() == VPInstruction::NumActiveLanes;
}
bool VPInstruction::isSingleScalar() const {
@@ -1545,6 +1561,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractLastActive:
O << "extract-last-active";
break;
+ case VPInstruction::NumActiveLanes:
+ O << "num-active-lanes";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 22a8edaf30eb6..ee301712b6fcb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5112,6 +5112,73 @@ void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
"VF, UF, and VFxUF not expected to be used");
}
+VPValue *
+VPlanTransforms::materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheck,
+ ArrayRef<PointerDiffInfo> DiffChecks) {
+
+ VPBuilder Builder(AliasCheck, AliasCheck->begin());
+ Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
+ Type *I64Ty = IntegerType::getInt64Ty(Plan.getContext());
+ Type *PtrTy = PointerType::getUnqual(Plan.getContext());
+
+ VPValue *AliasMask = nullptr;
+ for (PointerDiffInfo Check : DiffChecks) {
+ VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart);
+ VPValue *Sink =
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart);
+
+ VPValue *SrcPtr =
+ Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src, PtrTy,
+ DebugLoc::getCompilerGenerated());
+ VPValue *SinkPtr =
+ Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink, PtrTy,
+ DebugLoc::getCompilerGenerated());
+
+ VPWidenIntrinsicRecipe *WARMask = new VPWidenIntrinsicRecipe(
+ Intrinsic::loop_dependence_war_mask,
+ {SrcPtr, SinkPtr, Plan.getConstantInt(I64Ty, Check.AccessSize)}, I1Ty);
+ Builder.insert(WARMask);
+
+ if (AliasMask)
+ AliasMask = Builder.createAnd(AliasMask, WARMask);
+ else
+ AliasMask = WARMask;
+ }
+
+ Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+ VPValue *NumActive =
+ Builder.createNaryOp(VPInstruction::NumActiveLanes, {AliasMask});
+ VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
+ NumActive, IVTy, I64Ty, DebugLoc::getCompilerGenerated());
+
+ // Find the existing header mask.
+ VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
+ auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
+ if (HeaderMaskDef->isPhi())
+ Builder.setInsertPoint(&*HeaderMaskDef->getParent()->getFirstNonPhi());
+ else
+ Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
+
+ // Update all existing users of the header mask to "HeaderMask & AliasMask".
+ auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
+ HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
+ return dyn_cast<VPInstruction>(&U) != ClampedHeaderMask;
+ });
+
+ return ClampedVF;
+}
+
+void VPlanTransforms::fixupVFUsersForClampedVF(VPlan &Plan,
+ VPValue *ClampedVF) {
+ if (!ClampedVF)
+ return;
+
+ assert(Plan.getConcreteUF() == 1 &&
+ "Clamped VF not support with interleaving");
+ Plan.getVF().replaceAllUsesWith(ClampedVF);
+ Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
+}
+
DenseMap<const SCEV *, Value *>
VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f2dfc166cecc9..292a97b61817c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -180,6 +180,8 @@ struct VPlanTransforms {
/// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
/// VPValue and connect the block to \p Plan, using the VPValue as branch
/// condition.
+ static void attachCheckBlock(VPlan &Plan, VPValue *Cond,
+ ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/182457
More information about the llvm-commits mailing list