[llvm] 64686c5 - [VPlan] Connect (MemRuntime|SCEV)Check blocks as VPlan transform (NFC). (#143879)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 9 05:03:30 PDT 2025
Author: Florian Hahn
Date: 2025-07-09T14:03:25+02:00
New Revision: 64686c59c376902a4328d7759b2db860d11a5650
URL: https://github.com/llvm/llvm-project/commit/64686c59c376902a4328d7759b2db860d11a5650
DIFF: https://github.com/llvm/llvm-project/commit/64686c59c376902a4328d7759b2db860d11a5650.diff
LOG: [VPlan] Connect (MemRuntime|SCEV)Check blocks as VPlan transform (NFC). (#143879)
Connect the SCEV and memory runtime check blocks directly in VPlan as
VPIRBasicBlocks, removing ILV::emitSCEVChecks and
ILV::emitMemRuntimeChecks.
The new logic is split across
LoopVectorizationPlanner::attachRuntimeChecks, which collects the list of
{Condition, CheckBlock} pairs, performs some sanity checks, and emits
remarks if needed, and VPlanTransforms::attachCheckBlock, which then
connects each check block to the VPlan.
PR: https://github.com/llvm/llvm-project/pull/143879
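For orientation, here is a condensed sketch of the new flow, paraphrased
from the LoopVectorize.cpp and VPlanConstruction.cpp hunks below (not a
verbatim excerpt; the OptForSize assertions and remark emission are
omitted):

    // GeneratedRTChecks still creates the check IR up front, but now only
    // hands back {Condition, CheckBlock} pairs instead of emitting branches:
    //   std::pair<Value *, BasicBlock *> getSCEVChecks();
    //   std::pair<Value *, BasicBlock *> getMemRuntimeChecks();

    void LoopVectorizationPlanner::attachRuntimeChecks(
        VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
      const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
      if (SCEVCheckBlock)
        VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
                                          HasBranchWeights);
      const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
      if (MemCheckBlock)
        VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
                                          HasBranchWeights);
    }

VPlanTransforms::attachCheckBlock then wraps CheckBlock in a
VPIRBasicBlock, inserts it on the edge to the vector preheader with the
scalar preheader as the bypass successor, updates the scalar preheader's
phis, and terminates the block with a BranchOnCond VPInstruction on the
wrapped condition, adding branch weights if requested.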
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlan.cpp
llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
llvm/lib/Transforms/Vectorize/VPlanTransforms.h
llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 0eaa6d9bf03da..11853859484e3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -28,6 +28,10 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/InstructionCost.h"
+namespace {
+class GeneratedRTChecks;
+}
+
namespace llvm {
class LoopInfo;
@@ -554,6 +558,10 @@ class LoopVectorizationPlanner {
VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF);
+ /// Attach the runtime checks of \p RTChecks to \p Plan.
+ void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
+ bool HasBranchWeights) const;
+
#ifndef NDEBUG
/// \return The most profitable vectorization factor for the available VPlans
/// and the cost of that VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6bd6ba2d77bbb..992f98cec0010 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -399,12 +399,6 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));
-// Likelyhood of bypassing the vectorized loop because assumptions about SCEV
-// variables not overflowing do not hold. See `emitSCEVChecks`.
-static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
-// Likelyhood of bypassing the vectorized loop because pointers overlap. See
-// `emitMemRuntimeChecks`.
-static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelyhood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
@@ -544,16 +538,6 @@ class InnerLoopVectorizer {
/// it overflows.
void emitIterationCountCheck(BasicBlock *Bypass);
- /// Emit a bypass check to see if all of the SCEV assumptions we've
- /// had to make are correct. Returns the block containing the checks or
- /// nullptr if no checks have been added.
- BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
-
- /// Emit bypass checks to check any memory assumptions we may have made.
- /// Returns the block containing the checks or nullptr if no checks have been
- /// added.
- BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
-
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
/// vector loop preheader, middle block and scalar preheader.
void createVectorLoopSkeleton(StringRef Prefix);
@@ -657,8 +641,6 @@ struct EpilogueLoopVectorizationInfo {
unsigned EpilogueUF = 0;
BasicBlock *MainLoopIterationCountCheck = nullptr;
BasicBlock *EpilogueIterationCountCheck = nullptr;
- BasicBlock *SCEVSafetyCheck = nullptr;
- BasicBlock *MemSafetyCheck = nullptr;
Value *TripCount = nullptr;
Value *VectorTripCount = nullptr;
VPlan &EpiloguePlan;
@@ -1786,7 +1768,6 @@ class GeneratedRTChecks {
SCEVExpander MemCheckExp;
bool CostTooHigh = false;
- const bool AddBranchWeights;
Loop *OuterLoop = nullptr;
@@ -1798,11 +1779,10 @@ class GeneratedRTChecks {
public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
- const DataLayout &DL, bool AddBranchWeights,
- TTI::TargetCostKind CostKind)
+ const DataLayout &DL, TTI::TargetCostKind CostKind)
: DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
- MemCheckExp(*PSE.getSE(), DL, "scev.check"),
- AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
+ MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE),
+ CostKind(CostKind) {}
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -2019,56 +1999,20 @@ class GeneratedRTChecks {
MemCheckBlock->eraseFromParent();
}
- /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
- /// adjusts the branches to branch to the vector preheader or \p Bypass,
- /// depending on the generated condition.
- BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
- BasicBlock *LoopVectorPreHeader) {
+ /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
+ /// outside VPlan.
+ std::pair<Value *, BasicBlock *> getSCEVChecks() {
using namespace llvm::PatternMatch;
if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
- return nullptr;
-
- auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
- BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
-
- SCEVCheckBlock->getTerminator()->eraseFromParent();
- SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
- Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
- SCEVCheckBlock);
-
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond);
- if (AddBranchWeights)
- setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
- ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
- return SCEVCheckBlock;
- }
-
- /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
- /// the branches to branch to the vector preheader or \p Bypass, depending on
- /// the generated condition.
- BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
- BasicBlock *LoopVectorPreHeader) {
- // Check if we generated code that checks in runtime if arrays overlap.
- if (!MemRuntimeCheckCond)
- return nullptr;
-
- auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
- Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
- MemCheckBlock);
+ return {nullptr, nullptr};
- MemCheckBlock->moveBefore(LoopVectorPreHeader);
-
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
- if (AddBranchWeights) {
- setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
- }
- ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
- MemCheckBlock->getTerminator()->setDebugLoc(
- Pred->getTerminator()->getDebugLoc());
+ return {SCEVCheckCond, SCEVCheckBlock};
+ }
- return MemCheckBlock;
+ /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
+ /// outside VPlan.
+ std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
+ return {MemRuntimeCheckCond, MemCheckBlock};
}
/// Return true if any runtime checks have been added
@@ -2461,53 +2405,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
"Plan's entry must be TCCCheckBlock");
}
-BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
- BasicBlock *const SCEVCheckBlock =
- RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
- if (!SCEVCheckBlock)
- return nullptr;
-
- assert((!Cost->OptForSize ||
- Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
- "Cannot SCEV check stride or overflow when optimizing for size");
-
- introduceCheckBlockInVPlan(SCEVCheckBlock);
- return SCEVCheckBlock;
-}
-
-BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
- BasicBlock *const MemCheckBlock =
- RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
-
- // Check if we generated code that checks in runtime if arrays overlap. We put
- // the checks into a separate block to make the more common case of few
- // elements faster.
- if (!MemCheckBlock)
- return nullptr;
-
- // VPlan-native path does not do any analysis for runtime checks currently.
- assert((!EnableVPlanNativePath || OrigLoop->begin() == OrigLoop->end()) &&
- "Runtime checks are not supported for outer loops yet");
-
- if (Cost->OptForSize) {
- assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
- "Cannot emit memory checks when optimizing for size, unless forced "
- "to vectorize.");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
- OrigLoop->getStartLoc(),
- OrigLoop->getHeader())
- << "Code-size may be reduced by not forcing "
- "vectorization, or by source-code modifications "
- "eliminating the need for runtime checks "
- "(e.g., adding 'restrict').";
- });
- }
-
- introduceCheckBlockInVPlan(MemCheckBlock);
- return MemCheckBlock;
-}
-
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2624,15 +2521,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// to the scalar loop.
emitIterationCountCheck(LoopScalarPreHeader);
- // Generate the code to check any assumptions that we've made for SCEV
- // expressions.
- emitSCEVChecks(LoopScalarPreHeader);
-
- // Generate the code that checks in runtime if arrays overlap. We put the
- // checks into a separate block to make the more common case of few elements
- // faster.
- emitMemRuntimeChecks(LoopScalarPreHeader);
-
replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
return LoopVectorPreHeader;
}
@@ -7323,11 +7211,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
OrigLoop->getHeader()->getContext());
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
- if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+ bool HasBranchWeights =
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
+ if (HasBranchWeights) {
std::optional<unsigned> VScale = CM.getVScaleForTuning();
VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
BestVPlan, BestVF, VScale);
}
+
+ if (!VectorizingEpilogue) {
+ // Checks are the same for all VPlans, added to BestVPlan only for
+ // compactness.
+ attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
+ }
+
+  // Retrieve VectorPH now, while it is easy to do so because VPlan still has
+  // Regions.
+ VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
VPlanTransforms::narrowInterleaveGroups(
@@ -7375,7 +7274,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
- VPBasicBlock *VectorPH = cast<VPBasicBlock>(Entry->getSuccessors()[1]);
+ BasicBlock *EntryBB =
+ cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
if (VectorizingEpilogue)
VPlanTransforms::removeDeadRecipes(BestVPlan);
@@ -7399,6 +7299,13 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
+ // Move check blocks to their final position.
+ // TODO: Move as part of VPIRBB execute and update impacted tests.
+ if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
+ MemCheckBlock->moveAfter(EntryBB);
+ if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second)
+ SCEVCheckBlock->moveAfter(EntryBB);
+
BestVPlan.execute(&State);
// 2.5 When vectorizing the epilogue, fix reduction resume values from the
@@ -7499,15 +7406,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
emitIterationCountCheck(LoopScalarPreHeader, true);
EPI.EpilogueIterationCountCheck->setName("iter.check");
- // Generate the code to check any assumptions that we've made for SCEV
- // expressions.
- EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
-
- // Generate the code that checks at runtime if arrays overlap. We put the
- // checks into a separate block to make the more common case of few elements
- // faster.
- EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
-
// Generate the iteration count check for the main loop, *after* the check
// for the epilogue loop, so that the path-length is shorter for the case
// that goes directly through the vector epilogue. The longer-path length for
@@ -7611,11 +7509,14 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
- if (EPI.SCEVSafetyCheck)
- EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
+ // Adjust the terminators of runtime check blocks and phis using them.
+ BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
+ BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
+ if (SCEVCheckBlock)
+ SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
- if (EPI.MemSafetyCheck)
- EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
+ if (MemCheckBlock)
+ MemCheckBlock->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopScalarPreHeader,
@@ -7642,10 +7543,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
}))
continue;
Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
- if (EPI.SCEVSafetyCheck)
- Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
- if (EPI.MemSafetyCheck)
- Phi->removeIncomingValue(EPI.MemSafetyCheck);
+ if (SCEVCheckBlock)
+ Phi->removeIncomingValue(SCEVCheckBlock);
+ if (MemCheckBlock)
+ Phi->removeIncomingValue(MemCheckBlock);
}
replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
@@ -9380,6 +9281,43 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPlanTransforms::runPass(VPlanTransforms::clearReductionWrapFlags, *Plan);
}
+void LoopVectorizationPlanner::attachRuntimeChecks(
+ VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
+ const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
+ if (SCEVCheckBlock) {
+ assert((!CM.OptForSize ||
+ CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
+ "Cannot SCEV check stride or overflow when optimizing for size");
+ VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
+ HasBranchWeights);
+ }
+ const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
+ if (MemCheckBlock) {
+ // VPlan-native path does not do any analysis for runtime checks
+ // currently.
+ assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
+ "Runtime checks are not supported for outer loops yet");
+
+ if (CM.OptForSize) {
+ assert(
+ CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
+ "Cannot emit memory checks when optimizing for size, unless forced "
+ "to vectorize.");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
+ OrigLoop->getStartLoc(),
+ OrigLoop->getHeader())
+ << "Code-size may be reduced by not forcing "
+ "vectorization, or by source-code modifications "
+ "eliminating the need for runtime checks "
+ "(e.g., adding 'restrict').";
+ });
+ }
+ VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
+ HasBranchWeights);
+ }
+}
+
void VPDerivedIVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
@@ -9501,10 +9439,7 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
{
- bool AddBranchWeights =
- hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- AddBranchWeights, CM.CostKind);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10142,10 +10077,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);
- bool AddBranchWeights =
- hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- AddBranchWeights, CM.CostKind);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index ffa247afa2360..40a55656bfa7e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -487,10 +487,16 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
} else {
// Set each forward successor here when it is created, excluding
// backedges. A backward successor is set when the branch is created.
+ // Branches to VPIRBasicBlocks must have the same successors in VPlan as
+ // in the original IR, except when the predecessor is the entry block.
+ // This enables including SCEV and memory runtime check blocks in VPlan.
+ // TODO: Remove exception by modeling the terminator of entry block using
+ // BranchOnCond.
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
assert((TermBr && (!TermBr->getSuccessor(idx) ||
(isa<VPIRBasicBlock>(this) &&
- TermBr->getSuccessor(idx) == NewBB))) &&
+ (TermBr->getSuccessor(idx) == NewBB ||
+ PredVPBlock == getPlan()->getEntry())))) &&
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, NewBB);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 593e5063802ba..37e15326e01f9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/MDBuilder.h"
#define DEBUG_TYPE "vplan"
@@ -589,3 +590,41 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
TopRegion->setName("vector loop");
TopRegion->getEntryBasicBlock()->setName("vector.body");
}
+
+// Likelihood of bypassing the vectorized loop due to a runtime check block,
+// i.e. the memory overlap check block or the wrapping/unit-stride check block.
+static constexpr uint32_t CheckBypassWeights[] = {1, 127};
+
+void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
+ BasicBlock *CheckBlock,
+ bool AddBranchWeights) {
+ VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
+ VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
+ VPBlockBase *VectorPH = Plan.getVectorPreheader();
+ VPBlockBase *ScalarPH = Plan.getScalarPreheader();
+ VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
+ VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlockVPBB);
+ VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
+ CheckBlockVPBB->swapSuccessors();
+
+ // We just connected a new block to the scalar preheader. Update all
+ // VPPhis by adding an incoming value for it, replicating the last value.
+ unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+ for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+ assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+ assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+ "must have incoming values for all operands");
+ R.addOperand(R.getOperand(NumPredecessors - 2));
+ }
+
+ VPIRMetadata VPBranchWeights;
+ auto *Term = VPBuilder(CheckBlockVPBB)
+ .createNaryOp(VPInstruction::BranchOnCond, {CondVPV},
+ Plan.getCanonicalIV()->getDebugLoc());
+ if (AddBranchWeights) {
+ MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
+ MDNode *BranchWeights =
+ MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false);
+ Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 8d2eded45da22..b42c444f09be8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -74,6 +74,12 @@ struct VPlanTransforms {
/// flat CFG into a hierarchical CFG.
static void createLoopRegions(VPlan &Plan);
+ /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
+ /// VPValue and connect the block to \p Plan, using the VPValue as branch
+ /// condition.
+ static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
+ bool AddBranchWeights);
+
/// Replaces the VPInstructions in \p Plan with corresponding
/// widen recipes. Returns false if any VPInstructions could not be converted
/// to a wide recipe if needed.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index b4e49a60e0887..8ec38912af435 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -210,6 +210,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow
; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT: IR %10 = or i1 %8, %9
+; CHECK-NEXT: EMIT branch-on-cond ir<%10>
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.memcheck>:
@@ -218,6 +219,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %13 = mul i64 %12, 4
; CHECK-NEXT: IR %14 = sub i64 %B1, %A2
; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT: EMIT branch-on-cond ir<%diff.check>
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.ph>:
@@ -227,22 +229,22 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf
; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
-; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
-; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
+; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
+; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
-; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
+; CHECK-NEXT: vp<%6> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
+; CHECK-NEXT: WIDEN ir<%19> = load vp<%6>
; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%5>, ir<%add9>
+; CHECK-NEXT: vp<%7> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT: WIDEN store vp<%7>, ir<%add9>
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
; CHECK-NEXT: Successor(s): middle.block, vector.body
@@ -256,8 +258,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%2>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%4>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
@@ -281,10 +283,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %2 = mul nuw i64 %1, 4
; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2
-; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
+; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.ph
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.scevcheck> in BB: vector.scevcheck
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.scevcheck: ; preds = %for.body.preheader
+; CHECK-NEXT: vector.scevcheck: ; No predecessors!
; CHECK-NEXT: %3 = add nsw i64 %0, -1
; CHECK-NEXT: %4 = add i32 %n, -1
; CHECK-NEXT: %5 = trunc i64 %3 to i32
@@ -296,21 +298,21 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %8 = or i1 %7, %mul.overflow
; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT: %10 = or i1 %8, %9
-; CHECK-NEXT: br i1 %10, label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT: br i1 %10, <null operand!>, <null operand!>
; CHECK-NEXT: LV: draw edge from for.body.preheader
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.memcheck> in BB: vector.memcheck
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.memcheck: ; preds = %vector.scevcheck
+; CHECK-NEXT: vector.memcheck: ; No predecessors!
; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %12 = mul nuw i64 %11, 4
; CHECK-NEXT: %13 = mul i64 %12, 4
; CHECK-NEXT: %14 = sub i64 %B1, %A2
; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13
-; CHECK-NEXT: br i1 %diff.check, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 %diff.check, <null operand!>, <null operand!>
; CHECK-NEXT: LV: draw edge from vector.scevcheck
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.ph> in BB: vector.ph
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.ph: ; preds = %vector.memcheck
+; CHECK-NEXT: vector.ph: ; No predecessors!
; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %16 = mul nuw i64 %15, 4
; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16
@@ -366,7 +368,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: draw edge from middle.block
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<scalar.ph> in BB: scalar.ph
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
+; CHECK-NEXT: scalar.ph: ; preds = %for.body.preheader
; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
; CHECK-NEXT: br label %for.body
@@ -621,6 +623,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow
; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT: IR %10 = or i1 %8, %9
+; CHECK-NEXT: EMIT branch-on-cond ir<%10>
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.memcheck>:
@@ -629,6 +632,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %13 = mul i64 %12, 4
; CHECK-NEXT: IR %14 = sub i64 %B1, %A2
; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT: EMIT branch-on-cond ir<%diff.check>
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.ph>:
@@ -638,22 +642,22 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf
; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
-; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
-; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
+; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
+; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
-; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
+; CHECK-NEXT: vp<%6> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
+; CHECK-NEXT: WIDEN ir<%19> = load vp<%6>
; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%5>, ir<%conv1>
+; CHECK-NEXT: vp<%7> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT: WIDEN store vp<%7>, ir<%conv1>
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
; CHECK-NEXT: Successor(s): middle.block, vector.body
@@ -667,8 +671,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%2>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%4>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
@@ -692,10 +696,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %2 = mul nuw i64 %1, 4
; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2
-; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
+; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.ph
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.scevcheck> in BB: vector.scevcheck
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.scevcheck: ; preds = %for.body.preheader
+; CHECK-NEXT: vector.scevcheck: ; No predecessors!
; CHECK-NEXT: %3 = add nsw i64 %0, -1
; CHECK-NEXT: %4 = add i32 %n, -1
; CHECK-NEXT: %5 = trunc i64 %3 to i32
@@ -707,21 +711,21 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %8 = or i1 %7, %mul.overflow
; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT: %10 = or i1 %8, %9
-; CHECK-NEXT: br i1 %10, label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT: br i1 %10, <null operand!>, <null operand!>
; CHECK-NEXT: LV: draw edge from for.body.preheader
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.memcheck> in BB: vector.memcheck
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.memcheck: ; preds = %vector.scevcheck
+; CHECK-NEXT: vector.memcheck: ; No predecessors!
; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %12 = mul nuw i64 %11, 4
; CHECK-NEXT: %13 = mul i64 %12, 4
; CHECK-NEXT: %14 = sub i64 %B1, %A2
; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13
-; CHECK-NEXT: br i1 %diff.check, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 %diff.check, <null operand!>, <null operand!>
; CHECK-NEXT: LV: draw edge from vector.scevcheck
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.ph> in BB: vector.ph
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.ph: ; preds = %vector.memcheck
+; CHECK-NEXT: vector.ph: ; No predecessors!
; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %16 = mul nuw i64 %15, 4
; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16
@@ -777,7 +781,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: draw edge from middle.block
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<scalar.ph> in BB: scalar.ph
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
+; CHECK-NEXT: scalar.ph: ; preds = %for.body.preheader
; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
; CHECK-NEXT: br label %for.body