[llvm] [VPlan] Connect (MemRuntime|SCEV)Check blocks as VPlan transform (NFC). (PR #143879)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 9 03:52:06 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/143879
>From 556dac991b05e560e18a00494b5681e01a34ebc4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 17 Jun 2025 22:51:50 +0100
Subject: [PATCH 1/5] [VPlan] Connect (MemRuntime|SCEV)Check blocks as VPlan
transform (NFC).
Connect the SCEV and memory runtime check blocks directly in VPlan as
VPIRBasicBlocks, removing ILV::emitSCEVChecks and ILV::emitMemRuntimeChecks.
The new logic is split across LoopVectorizationPlanner::addRuntimeChecks,
which collects a list of {Condition, CheckBlock} pairs and performs some
checks and emits remarks if needed, and VPlanTransforms::connectCheckBlocks,
which then adds the collected checks to the VPlan.
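
A rough, self-contained sketch of the two-phase flow described above, using
simplified stand-in types rather than the real VPlan classes (Block,
insertOnEdge and the connectCheckBlocks signature below are illustrative
only, not the actual LLVM API):

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Stand-in for a VPlan block; only tracks CFG edges.
struct Block {
  std::string Name;
  std::vector<Block *> Succs;
  std::vector<Block *> Preds;
};

// Rewire the edge From -> To into From -> New -> To, mirroring what
// VPBlockUtils::insertOnEdge does in the real transform.
static void insertOnEdge(Block *From, Block *To, Block *New) {
  for (Block *&S : From->Succs)
    if (S == To)
      S = New;
  for (Block *&P : To->Preds)
    if (P == From)
      P = New;
  New->Preds.push_back(From);
  New->Succs.push_back(To);
}

// Phase 2 (cf. VPlanTransforms::connectCheckBlocks): hook each collected
// check block into the edge entering the vector preheader and give it a
// bypass edge to the scalar preheader. The real transform additionally
// swaps the successors, updates the scalar-preheader phis and emits a
// BranchOnCond terminator driven by the condition (plus optional branch
// weights); those details are elided here.
static void connectCheckBlocks(
    Block *VectorPH, Block *ScalarPH,
    const std::vector<std::pair<bool, Block *>> &Checks) {
  for (const auto &[Cond, CheckBlock] : Checks) {
    assert(VectorPH->Preds.size() == 1 && "expected a single predecessor");
    Block *PreVectorPH = VectorPH->Preds.front();
    insertOnEdge(PreVectorPH, VectorPH, CheckBlock);
    CheckBlock->Succs.push_back(ScalarPH); // bypass to the scalar loop
    ScalarPH->Preds.push_back(CheckBlock);
    (void)Cond; // would drive the BranchOnCond terminator
  }
}

int main() {
  Block Entry{"entry"}, VectorPH{"vector.ph"}, ScalarPH{"scalar.ph"};
  Entry.Succs = {&VectorPH};
  VectorPH.Preds = {&Entry};

  // Phase 1 (cf. LoopVectorizationPlanner::addRuntimeChecks): collect the
  // {Condition, CheckBlock} pairs; the conditions are placeholders here.
  Block SCEVCheck{"vector.scevcheck"}, MemCheck{"vector.memcheck"};
  connectCheckBlocks(&VectorPH, &ScalarPH,
                     {{true, &SCEVCheck}, {true, &MemCheck}});
  return 0;
}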
---
.../Vectorize/LoopVectorizationPlanner.h | 7 +
.../Transforms/Vectorize/LoopVectorize.cpp | 209 +++++++-----------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 3 +-
.../Vectorize/VPlanConstruction.cpp | 39 ++++
.../Transforms/Vectorize/VPlanTransforms.h | 7 +
.../RISCV/riscv-vector-reverse.ll | 82 +++----
6 files changed, 160 insertions(+), 187 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 144f35e10132f..caad61543262b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -28,6 +28,10 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/InstructionCost.h"
+namespace {
+class GeneratedRTChecks;
+}
+
namespace llvm {
class LoopInfo;
@@ -560,6 +564,9 @@ class LoopVectorizationPlanner {
VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF);
+ /// Add the runtime checks from \p RTChecks to \p VPlan.
+ void addRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks) const;
+
#ifndef NDEBUG
/// \return The most profitable vectorization factor for the available VPlans
/// and the cost of that VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bb29e4fc6d232..a4848e9185623 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -399,12 +399,6 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));
-// Likelyhood of bypassing the vectorized loop because assumptions about SCEV
-// variables not overflowing do not hold. See `emitSCEVChecks`.
-static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
-// Likelyhood of bypassing the vectorized loop because pointers overlap. See
-// `emitMemRuntimeChecks`.
-static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelyhood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
@@ -544,16 +538,6 @@ class InnerLoopVectorizer {
/// it overflows.
void emitIterationCountCheck(BasicBlock *Bypass);
- /// Emit a bypass check to see if all of the SCEV assumptions we've
- /// had to make are correct. Returns the block containing the checks or
- /// nullptr if no checks have been added.
- BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
-
- /// Emit bypass checks to check any memory assumptions we may have made.
- /// Returns the block containing the checks or nullptr if no checks have been
- /// added.
- BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
-
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
/// vector loop preheader, middle block and scalar preheader.
void createVectorLoopSkeleton(StringRef Prefix);
@@ -1790,7 +1774,6 @@ class GeneratedRTChecks {
SCEVExpander MemCheckExp;
bool CostTooHigh = false;
- const bool AddBranchWeights;
Loop *OuterLoop = nullptr;
@@ -1802,11 +1785,10 @@ class GeneratedRTChecks {
public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
- const DataLayout &DL, bool AddBranchWeights,
- TTI::TargetCostKind CostKind)
+ const DataLayout &DL, TTI::TargetCostKind CostKind)
: DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
- MemCheckExp(*PSE.getSE(), DL, "scev.check"),
- AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
+ MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE),
+ CostKind(CostKind) {}
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -2025,61 +2007,35 @@ class GeneratedRTChecks {
/// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
/// adjusts the branches to branch to the vector preheader or \p Bypass,
/// depending on the generated condition.
- BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
- BasicBlock *LoopVectorPreHeader) {
+ std::pair<Value *, BasicBlock *> emitSCEVChecks() {
using namespace llvm::PatternMatch;
if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
- return nullptr;
+ return {nullptr, nullptr};
- auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
- BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
-
- SCEVCheckBlock->getTerminator()->eraseFromParent();
- SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
- Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
- SCEVCheckBlock);
-
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond);
- if (AddBranchWeights)
- setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
- ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
- // Mark the check as used, to prevent it from being removed during cleanup.
+ Value *Cond = SCEVCheckCond;
SCEVCheckCond = nullptr;
AddedAnyChecks = true;
- return SCEVCheckBlock;
+ return {Cond, SCEVCheckBlock};
}
/// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
/// the branches to branch to the vector preheader or \p Bypass, depending on
/// the generated condition.
- BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
- BasicBlock *LoopVectorPreHeader) {
+ std::pair<Value *, BasicBlock *> emitMemRuntimeChecks() {
// Check if we generated code that checks in runtime if arrays overlap.
if (!MemRuntimeCheckCond)
- return nullptr;
-
- auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
- Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
- MemCheckBlock);
-
- MemCheckBlock->moveBefore(LoopVectorPreHeader);
-
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
- if (AddBranchWeights) {
- setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
- }
- ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
- MemCheckBlock->getTerminator()->setDebugLoc(
- Pred->getTerminator()->getDebugLoc());
+ return {nullptr, nullptr};
// Mark the check as used, to prevent it from being removed during cleanup.
+ Value *Cond = MemRuntimeCheckCond;
MemRuntimeCheckCond = nullptr;
AddedAnyChecks = true;
- return MemCheckBlock;
+ return {Cond, MemCheckBlock};
}
+ BasicBlock *getSCEVCheckBlock() const { return SCEVCheckBlock; }
+ BasicBlock *getMemCheckBlock() const { return MemCheckBlock; }
+
/// Return true if any runtime checks have been added
bool hasChecks() const { return AddedAnyChecks; }
};
@@ -2466,53 +2422,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
"Plan's entry must be TCCCheckBlock");
}
-BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
- BasicBlock *const SCEVCheckBlock =
- RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
- if (!SCEVCheckBlock)
- return nullptr;
-
- assert((!Cost->OptForSize ||
- Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
- "Cannot SCEV check stride or overflow when optimizing for size");
-
- introduceCheckBlockInVPlan(SCEVCheckBlock);
- return SCEVCheckBlock;
-}
-
-BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
- BasicBlock *const MemCheckBlock =
- RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
-
- // Check if we generated code that checks in runtime if arrays overlap. We put
- // the checks into a separate block to make the more common case of few
- // elements faster.
- if (!MemCheckBlock)
- return nullptr;
-
- // VPlan-native path does not do any analysis for runtime checks currently.
- assert((!EnableVPlanNativePath || OrigLoop->begin() == OrigLoop->end()) &&
- "Runtime checks are not supported for outer loops yet");
-
- if (Cost->OptForSize) {
- assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
- "Cannot emit memory checks when optimizing for size, unless forced "
- "to vectorize.");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
- OrigLoop->getStartLoc(),
- OrigLoop->getHeader())
- << "Code-size may be reduced by not forcing "
- "vectorization, or by source-code modifications "
- "eliminating the need for runtime checks "
- "(e.g., adding 'restrict').";
- });
- }
-
- introduceCheckBlockInVPlan(MemCheckBlock);
- return MemCheckBlock;
-}
-
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2629,15 +2538,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// to the scalar loop.
emitIterationCountCheck(LoopScalarPreHeader);
- // Generate the code to check any assumptions that we've made for SCEV
- // expressions.
- emitSCEVChecks(LoopScalarPreHeader);
-
- // Generate the code that checks in runtime if arrays overlap. We put the
- // checks into a separate block to make the more common case of few elements
- // faster.
- emitMemRuntimeChecks(LoopScalarPreHeader);
-
replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
return LoopVectorPreHeader;
}
@@ -7333,6 +7233,11 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
BestVPlan, BestVF);
+
+ if (!VectorizingEpilogue)
+ addRuntimeChecks(BestVPlan, ILV.RTChecks);
+
+ VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
VPlanTransforms::narrowInterleaveGroups(
@@ -7380,7 +7285,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
- VPBasicBlock *VectorPH = cast<VPBasicBlock>(Entry->getSuccessors()[1]);
+ BasicBlock *EntryBB =
+ cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
if (VectorizingEpilogue)
VPlanTransforms::removeDeadRecipes(BestVPlan);
@@ -7404,6 +7310,12 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
+ // Move check blocks to their final position.
+ if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemCheckBlock())
+ MemCheckBlock->moveAfter(EntryBB);
+ if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVCheckBlock())
+ SCEVCheckBlock->moveAfter(EntryBB);
+
BestVPlan.execute(&State);
// 2.5 When vectorizing the epilogue, fix reduction resume values from the
@@ -7504,15 +7416,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
emitIterationCountCheck(LoopScalarPreHeader, true);
EPI.EpilogueIterationCountCheck->setName("iter.check");
- // Generate the code to check any assumptions that we've made for SCEV
- // expressions.
- EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
-
- // Generate the code that checks at runtime if arrays overlap. We put the
- // checks into a separate block to make the more common case of few elements
- // faster.
- EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
-
// Generate the iteration count check for the main loop, *after* the check
// for the epilogue loop, so that the path-length is shorter for the case
// that goes directly through the vector epilogue. The longer-path length for
@@ -7616,6 +7519,13 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+ BasicBlock *SCEVCheckBlock = RTChecks.getSCEVCheckBlock();
+ if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessorsOrMore(1))
+ EPI.SCEVSafetyCheck = SCEVCheckBlock;
+
+ BasicBlock *MemCheckBlock = RTChecks.getMemCheckBlock();
+ if (MemCheckBlock && MemCheckBlock->hasNPredecessorsOrMore(1))
+ EPI.MemSafetyCheck = MemCheckBlock;
if (EPI.SCEVSafetyCheck)
EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
@@ -9353,6 +9263,47 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPlanTransforms::runPass(VPlanTransforms::clearReductionWrapFlags, *Plan);
}
+void LoopVectorizationPlanner::addRuntimeChecks(
+ VPlan &Plan, GeneratedRTChecks &RTChecks) const {
+ SmallVector<std::pair<VPValue *, VPIRBasicBlock *>> Checks;
+ const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.emitSCEVChecks();
+ if (SCEVCheckBlock) {
+ assert((!CM.OptForSize ||
+ CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
+ "Cannot SCEV check stride or overflow when optimizing for size");
+ Checks.emplace_back(Plan.getOrAddLiveIn(SCEVCheckCond),
+ Plan.createVPIRBasicBlock(SCEVCheckBlock));
+ }
+ const auto &[MemCheckCond, MemCheckBlock] = RTChecks.emitMemRuntimeChecks();
+ if (MemCheckBlock) {
+ // VPlan-native path does not do any analysis for runtime checks
+ // currently.
+ assert((!EnableVPlanNativePath || OrigLoop->begin() == OrigLoop->end()) &&
+ "Runtime checks are not supported for outer loops yet");
+
+ if (CM.OptForSize) {
+ assert(
+ CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
+ "Cannot emit memory checks when optimizing for size, unless forced "
+ "to vectorize.");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
+ OrigLoop->getStartLoc(),
+ OrigLoop->getHeader())
+ << "Code-size may be reduced by not forcing "
+ "vectorization, or by source-code modifications "
+ "eliminating the need for runtime checks "
+ "(e.g., adding 'restrict').";
+ });
+ }
+ Checks.emplace_back(Plan.getOrAddLiveIn(MemCheckCond),
+ Plan.createVPIRBasicBlock(MemCheckBlock));
+ }
+ VPlanTransforms::connectCheckBlocks(
+ Plan, Checks,
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()));
+}
+
void VPDerivedIVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
@@ -9474,10 +9425,7 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
{
- bool AddBranchWeights =
- hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- AddBranchWeights, CM.CostKind);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10116,10 +10064,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);
- bool AddBranchWeights =
- hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- AddBranchWeights, CM.CostKind);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 63ac80698643d..002f4ef2c000b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -490,7 +490,8 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
assert((TermBr && (!TermBr->getSuccessor(idx) ||
(isa<VPIRBasicBlock>(this) &&
- TermBr->getSuccessor(idx) == NewBB))) &&
+ (TermBr->getSuccessor(idx) == NewBB ||
+ PredVPBlock == getPlan()->getEntry())))) &&
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, NewBB);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 593e5063802ba..a55e95ef274b7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/MDBuilder.h"
#define DEBUG_TYPE "vplan"
@@ -589,3 +590,41 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
TopRegion->setName("vector loop");
TopRegion->getEntryBasicBlock()->setName("vector.body");
}
+
+// Likelyhood of bypassing the vectorized loop because SCEV assumptions or
+// memory runtime checks.
+static constexpr uint32_t CheckBypassWeights[] = {1, 127};
+
+void VPlanTransforms::connectCheckBlocks(
+ VPlan &Plan, ArrayRef<std::pair<VPValue *, VPIRBasicBlock *>> Checks,
+ bool AddBranchWeights) {
+ VPBlockBase *VectorPH = Plan.getVectorPreheader();
+ VPBlockBase *ScalarPH = Plan.getScalarPreheader();
+ for (const auto &[Cond, CheckBlock] : Checks) {
+ VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
+ VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlock);
+ VPBlockUtils::connectBlocks(CheckBlock, ScalarPH);
+ CheckBlock->swapSuccessors();
+
+ // We just connected a new block to the scalar preheader. Update all
+ // VPPhis by adding an incoming value for it, replicating the last value.
+ unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+ for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+ assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+ assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+ "must have incoming values for all operands");
+ R.addOperand(R.getOperand(NumPredecessors - 2));
+ }
+
+ VPIRMetadata VPBranchWeights;
+ auto *Term = VPBuilder(CheckBlock)
+ .createNaryOp(VPInstruction::BranchOnCond, {Cond},
+ Plan.getCanonicalIV()->getDebugLoc());
+ if (AddBranchWeights) {
+ MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
+ MDNode *BranchWeights =
+ MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false);
+ Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
+ }
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 40885cd52a127..515f656d4be3c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -74,6 +74,13 @@ struct VPlanTransforms {
/// flat CFG into a hierarchical CFG.
static void createLoopRegions(VPlan &Plan);
+ /// Connect the blocks in \p Checks to \p Plan, using the corresponding
+ /// VPValue as branch condition.
+ static void
+ connectCheckBlocks(VPlan &Plan,
+ ArrayRef<std::pair<VPValue *, VPIRBasicBlock *>> Checks,
+ bool AddBranchWeights);
+
/// Replaces the VPInstructions in \p Plan with corresponding
/// widen recipes. Returns false if any VPInstructions could not be converted
/// to a wide recipe if needed.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index dd8b7d6ea7e42..002fae6827557 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -210,6 +210,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow
; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT: IR %10 = or i1 %8, %9
+; CHECK-NEXT: EMIT branch-on-cond ir<%10>
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.memcheck>:
@@ -218,6 +219,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %13 = mul i64 %12, 4
; CHECK-NEXT: IR %14 = sub i64 %B1, %A2
; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT: EMIT branch-on-cond ir<%diff.check>
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.ph>:
@@ -227,22 +229,22 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf
; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
-; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
-; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
+; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
+; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
-; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
+; CHECK-NEXT: vp<%6> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
+; CHECK-NEXT: WIDEN ir<%19> = load vp<%6>
; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%5>, ir<%add9>
+; CHECK-NEXT: vp<%7> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT: WIDEN store vp<%7>, ir<%add9>
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
; CHECK-NEXT: Successor(s): middle.block, vector.body
@@ -256,8 +258,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%2>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%4>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
@@ -284,7 +286,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.scevcheck> in BB: vector.scevcheck
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.scevcheck: ; preds = %for.body.preheader
+; CHECK-NEXT: vector.scevcheck: ; No predecessors!
; CHECK-NEXT: %3 = add nsw i64 %0, -1
; CHECK-NEXT: %4 = add i32 %n, -1
; CHECK-NEXT: %5 = trunc i64 %3 to i32
@@ -296,25 +298,13 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %8 = or i1 %7, %mul.overflow
; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT: %10 = or i1 %8, %9
-; CHECK-NEXT: br i1 %10, label %scalar.ph, label %vector.memcheck
-; CHECK-NEXT: LV: draw edge from for.body.preheader
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.memcheck> in BB: vector.memcheck
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.memcheck: ; preds = %vector.scevcheck
+; CHECK-NEXT: vector.memcheck: ; No predecessors!
; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %12 = mul nuw i64 %11, 4
; CHECK-NEXT: %13 = mul i64 %12, 4
; CHECK-NEXT: %14 = sub i64 %B1, %A2
; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13
-; CHECK-NEXT: br i1 %diff.check, label %scalar.ph, label %vector.ph
-; CHECK-NEXT: LV: draw edge from vector.scevcheck
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.ph> in BB: vector.ph
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.ph: ; preds = %vector.memcheck
-; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: %16 = mul nuw i64 %15, 4
-; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16
-; CHECK-NEXT: %n.vec = sub i64 %0, %n.mod.vf
; CHECK-NEXT: %17 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %18 = mul nuw i64 %17, 4
; CHECK-NEXT: %19 = sub i64 %0, %n.vec
@@ -364,7 +354,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: draw edge from middle.block
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<scalar.ph> in BB: scalar.ph
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
+; CHECK-NEXT: scalar.ph: ; preds = %for.body.preheader
; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
; CHECK-NEXT: br label %for.body
@@ -619,6 +609,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow
; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT: IR %10 = or i1 %8, %9
+; CHECK-NEXT: EMIT branch-on-cond ir<%10>
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.memcheck>:
@@ -627,6 +618,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %13 = mul i64 %12, 4
; CHECK-NEXT: IR %14 = sub i64 %B1, %A2
; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT: EMIT branch-on-cond ir<%diff.check>
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.ph>:
@@ -635,23 +627,20 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16
; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf
; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
-; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
-; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
+; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
+; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
-; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
+; CHECK-NEXT: vp<%6> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
+; CHECK-NEXT: WIDEN ir<%19> = load vp<%6>
; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%5>, ir<%conv1>
+; CHECK-NEXT: vp<%7> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT: WIDEN store vp<%7>, ir<%conv1>
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
; CHECK-NEXT: Successor(s): middle.block, vector.body
@@ -665,8 +654,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%2>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%4>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
@@ -690,10 +679,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %2 = mul nuw i64 %1, 4
; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2
-; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.scevcheck> in BB: vector.scevcheck
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.scevcheck: ; preds = %for.body.preheader
+; CHECK-NEXT: vector.scevcheck: ; No predecessors!
; CHECK-NEXT: %3 = add nsw i64 %0, -1
; CHECK-NEXT: %4 = add i32 %n, -1
; CHECK-NEXT: %5 = trunc i64 %3 to i32
@@ -705,21 +692,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %8 = or i1 %7, %mul.overflow
; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT: %10 = or i1 %8, %9
-; CHECK-NEXT: br i1 %10, label %scalar.ph, label %vector.memcheck
-; CHECK-NEXT: LV: draw edge from for.body.preheader
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.memcheck> in BB: vector.memcheck
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.memcheck: ; preds = %vector.scevcheck
-; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: %12 = mul nuw i64 %11, 4
-; CHECK-NEXT: %13 = mul i64 %12, 4
-; CHECK-NEXT: %14 = sub i64 %B1, %A2
-; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13
-; CHECK-NEXT: br i1 %diff.check, label %scalar.ph, label %vector.ph
-; CHECK-NEXT: LV: draw edge from vector.scevcheck
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.ph> in BB: vector.ph
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.ph: ; preds = %vector.memcheck
+; CHECK-NEXT: vector.ph: ; No predecessors!
; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %16 = mul nuw i64 %15, 4
; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16
@@ -773,7 +747,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: draw edge from middle.block
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<scalar.ph> in BB: scalar.ph
; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
+; CHECK-NEXT: scalar.ph: ; preds = %for.body.preheader
; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
; CHECK-NEXT: br label %for.body
>From 044d73ac383da7cc1db481f8f1902733c1904287 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 27 Jun 2025 10:44:27 +0100
Subject: [PATCH 2/5] !fixup address latest comments, thanks
---
.../Transforms/Vectorize/LoopVectorize.cpp | 34 ++++++++++---------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 5 +++
.../Vectorize/VPlanConstruction.cpp | 4 +--
3 files changed, 25 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b65a21a0e6a9b..8641b510667ed 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -641,8 +641,6 @@ struct EpilogueLoopVectorizationInfo {
unsigned EpilogueUF = 0;
BasicBlock *MainLoopIterationCountCheck = nullptr;
BasicBlock *EpilogueIterationCountCheck = nullptr;
- BasicBlock *SCEVSafetyCheck = nullptr;
- BasicBlock *MemSafetyCheck = nullptr;
Value *TripCount = nullptr;
Value *VectorTripCount = nullptr;
VPlan &EpiloguePlan;
@@ -7514,18 +7512,21 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+ // Retrieve blocks with SCEV and memory runtime checks, if they have been
+ // connected to the CFG, otherwise they are unused and will be deleted. Their
+ // terminators and phis using them need adjusting below.
BasicBlock *SCEVCheckBlock = RTChecks.getSCEVCheckBlock();
- if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessorsOrMore(1))
- EPI.SCEVSafetyCheck = SCEVCheckBlock;
-
+ if (SCEVCheckBlock && pred_empty(SCEVCheckBlock))
+ SCEVCheckBlock = nullptr;
BasicBlock *MemCheckBlock = RTChecks.getMemCheckBlock();
- if (MemCheckBlock && MemCheckBlock->hasNPredecessorsOrMore(1))
- EPI.MemSafetyCheck = MemCheckBlock;
- if (EPI.SCEVSafetyCheck)
- EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
+ if (MemCheckBlock && pred_empty(MemCheckBlock))
+ MemCheckBlock = nullptr;
+
+ if (SCEVCheckBlock)
+ SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
- if (EPI.MemSafetyCheck)
- EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
+ if (MemCheckBlock)
+ MemCheckBlock->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopScalarPreHeader,
@@ -7552,10 +7553,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
}))
continue;
Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
- if (EPI.SCEVSafetyCheck)
- Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
- if (EPI.MemSafetyCheck)
- Phi->removeIncomingValue(EPI.MemSafetyCheck);
+ if (SCEVCheckBlock)
+ Phi->removeIncomingValue(SCEVCheckBlock);
+ if (MemCheckBlock)
+ Phi->removeIncomingValue(MemCheckBlock);
}
replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
@@ -9273,7 +9274,7 @@ void LoopVectorizationPlanner::addRuntimeChecks(
if (MemCheckBlock) {
// VPlan-native path does not do any analysis for runtime checks
// currently.
- assert((!EnableVPlanNativePath || OrigLoop->begin() == OrigLoop->end()) &&
+ assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
"Runtime checks are not supported for outer loops yet");
if (CM.OptForSize) {
@@ -9294,6 +9295,7 @@ void LoopVectorizationPlanner::addRuntimeChecks(
Checks.emplace_back(Plan.getOrAddLiveIn(MemCheckCond),
Plan.createVPIRBasicBlock(MemCheckBlock));
}
+
VPlanTransforms::connectCheckBlocks(
Plan, Checks,
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 0abd936e28398..3b629b42e2752 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -487,6 +487,11 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
} else {
// Set each forward successor here when it is created, excluding
// backedges. A backward successor is set when the branch is created.
+ // Branches to VPIRBasicBlocks must have the same successors in VPlan as
+ // in the original IR, except if the predecessors is the entry block. This
+ // enables including SCEV and memory runtime check blocks in VPlan.
+ // TODO: Remove exception by modeling branch in the entry block using
+ // BranchOnCond.
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
assert((TermBr && (!TermBr->getSuccessor(idx) ||
(isa<VPIRBasicBlock>(this) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index a55e95ef274b7..1f594260c4426 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -591,8 +591,8 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
TopRegion->getEntryBasicBlock()->setName("vector.body");
}
-// Likelyhood of bypassing the vectorized loop because SCEV assumptions or
-// memory runtime checks.
+// Likelyhood of bypassing the vectorized loop due to SCEV or memory runtime
+// checks.
static constexpr uint32_t CheckBypassWeights[] = {1, 127};
void VPlanTransforms::connectCheckBlocks(
>From 40471dafd079564381d417db2c25e9193bd46bbd Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 5 Jul 2025 22:03:38 +0100
Subject: [PATCH 3/5] !fixup address comments, thanks
---
.../Vectorize/LoopVectorizationPlanner.h | 5 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 335 ++++++++++--------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 6 +-
.../Vectorize/VPlanConstruction.cpp | 24 +-
.../Transforms/Vectorize/VPlanTransforms.h | 15 +-
.../RISCV/riscv-vector-reverse.ll | 100 ++++--
6 files changed, 278 insertions(+), 207 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index caad61543262b..aa30dc7d8eebc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -564,8 +564,9 @@ class LoopVectorizationPlanner {
VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF);
- /// Add the runtime checks from \p RTChecks to \p VPlan.
- void addRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks) const;
+ /// Attach the runtime checks of \p RTChecks to \p Plan.
+ void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
+ bool HasBranchWeights) const;
#ifndef NDEBUG
/// \return The most profitable vectorization factor for the available VPlans
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8641b510667ed..45f503f627ae9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -395,7 +395,7 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
cl::desc("Try wider VFs if they enable the use of vector variants"));
static cl::opt<bool> EnableEarlyExitVectorization(
- "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
+ "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));
@@ -2002,10 +2002,9 @@ class GeneratedRTChecks {
MemCheckBlock->eraseFromParent();
}
- /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
- /// adjusts the branches to branch to the vector preheader or \p Bypass,
- /// depending on the generated condition.
- std::pair<Value *, BasicBlock *> emitSCEVChecks() {
+ /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
+ /// outside VPlan.
+ std::pair<Value *, BasicBlock *> getSCEVChecks() {
using namespace llvm::PatternMatch;
if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
return {nullptr, nullptr};
@@ -2014,21 +2013,15 @@ class GeneratedRTChecks {
return {SCEVCheckCond , SCEVCheckBlock};
}
- /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
- /// the branches to branch to the vector preheader or \p Bypass, depending on
- /// the generated condition.
- std::pair<Value *, BasicBlock *> emitMemRuntimeChecks() {
+ /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
+ /// outside VPlan.
+ std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
// Check if we generated code that checks in runtime if arrays overlap.
- if (!MemRuntimeCheckCond)
- return {nullptr, nullptr};
-
- AddedAnyChecks = true;
+ if (MemRuntimeCheckCond)
+ AddedAnyChecks = true;
return {MemRuntimeCheckCond, MemCheckBlock};
}
- BasicBlock *getSCEVCheckBlock() const { return SCEVCheckBlock; }
- BasicBlock *getMemCheckBlock() const { return MemCheckBlock; }
-
/// Return true if any runtime checks have been added
bool hasChecks() const { return AddedAnyChecks; }
};
@@ -4819,7 +4812,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
const RecurrenceDescriptor &RdxDesc = Reduction.second;
RecurKind RK = RdxDesc.getRecurrenceKind();
return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
- RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
+ RecurrenceDescriptor::isFindIVRecurrenceKind(RK);
});
if (HasSelectCmpReductions) {
LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
@@ -5962,24 +5955,23 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
auto *SE = PSE.getSE();
- auto HasSingleCopyAfterVectorization = [this](Instruction *I,
- ElementCount VF) -> bool {
- if (VF.isScalar())
- return true;
-
- auto Scalarized = InstsToScalarize.find(VF);
- assert(Scalarized != InstsToScalarize.end() &&
- "VF not yet analyzed for scalarization profitability");
- return !Scalarized->second.count(I) &&
- llvm::all_of(I->users(), [&](User *U) {
- auto *UI = cast<Instruction>(U);
- return !Scalarized->second.count(UI);
- });
- };
- (void)HasSingleCopyAfterVectorization;
-
Type *VectorTy;
if (isScalarAfterVectorization(I, VF)) {
+ [[maybe_unused]] auto HasSingleCopyAfterVectorization =
+ [this](Instruction *I, ElementCount VF) -> bool {
+ if (VF.isScalar())
+ return true;
+
+ auto Scalarized = InstsToScalarize.find(VF);
+ assert(Scalarized != InstsToScalarize.end() &&
+ "VF not yet analyzed for scalarization profitability");
+ return !Scalarized->second.count(I) &&
+ llvm::all_of(I->users(), [&](User *U) {
+ auto *UI = cast<Instruction>(U);
+ return !Scalarized->second.count(UI);
+ });
+ };
+
// With the exception of GEPs and PHIs, after scalarization there should
// only be one copy of the instruction generated in the loop. This is
// because the VF is either 1, or any instructions that need scalarizing
@@ -6239,8 +6231,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
Type *ValTy = I->getOperand(0)->getType();
if (canTruncateToMinimalBitwidth(I, VF)) {
- Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
- (void)Op0AsInstruction;
+ [[maybe_unused]] Instruction *Op0AsInstruction =
+ dyn_cast<Instruction>(I->getOperand(0));
assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
MinBWs[I] == MinBWs[Op0AsInstruction]) &&
"if both the operand and the compare are marked for "
@@ -7138,8 +7130,8 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
using namespace VPlanPatternMatch;
- assert(RdxResult->getOpcode() == VPInstruction::ComputeFindLastIVResult &&
- "RdxResult must be ComputeFindLastIVResult");
+ assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
+ "RdxResult must be ComputeFindIVResult");
VPValue *StartVPV = RdxResult->getOperand(1);
match(StartVPV, m_Freeze(m_VPValue(StartVPV)));
return StartVPV->getLiveInIRValue();
@@ -7153,11 +7145,14 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
// Get the VPInstruction computing the reduction result in the middle block.
// The first operand may not be from the middle block if it is not connected
// to the scalar preheader. In that case, there's nothing to fix.
- auto *EpiRedResult = dyn_cast<VPInstruction>(EpiResumePhiR->getOperand(0));
+ VPValue *Incoming = EpiResumePhiR->getOperand(0);
+ match(Incoming, VPlanPatternMatch::m_ZExtOrSExt(
+ VPlanPatternMatch::m_VPValue(Incoming)));
+ auto *EpiRedResult = dyn_cast<VPInstruction>(Incoming);
if (!EpiRedResult ||
(EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
- EpiRedResult->getOpcode() != VPInstruction::ComputeFindLastIVResult))
+ EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult))
return;
auto *EpiRedHeaderPhi =
@@ -7174,8 +7169,8 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
- Value *StartV = EpiRedResult->getOperand(1)->getLiveInIRValue();
- (void)StartV;
+ [[maybe_unused]] Value *StartV =
+ EpiRedResult->getOperand(1)->getLiveInIRValue();
auto *Cmp = cast<ICmpInst>(MainResumeValue);
assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
"AnyOf expected to start with ICMP_NE");
@@ -7183,13 +7178,13 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
"AnyOf expected to start by comparing main resume value to original "
"start value");
MainResumeValue = Cmp->getOperand(0);
- } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
+ } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
Value *StartV = getStartValueFromReductionResult(EpiRedResult);
Value *SentinelV = EpiRedResult->getOperand(2)->getLiveInIRValue();
using namespace llvm::PatternMatch;
Value *Cmp, *OrigResumeV, *CmpOp;
- bool IsExpectedPattern =
+ [[maybe_unused]] bool IsExpectedPattern =
match(MainResumeValue,
m_Select(m_OneUse(m_Value(Cmp)), m_Specific(SentinelV),
m_Value(OrigResumeV))) &&
@@ -7197,7 +7192,6 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
m_Value(CmpOp))) &&
((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
assert(IsExpectedPattern && "Unexpected reduction resume pattern");
- (void)IsExpectedPattern;
MainResumeValue = OrigResumeV;
}
PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
@@ -7223,13 +7217,21 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
OrigLoop->getHeader()->getContext());
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
- if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+ bool HasBranchWeights =
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
+ if (HasBranchWeights) {
+ std::optional<unsigned> VScale = CM.getVScaleForTuning();
VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
- BestVPlan, BestVF);
+ BestVPlan, BestVF, VScale);
+ }
- if (!VectorizingEpilogue)
- addRuntimeChecks(BestVPlan, ILV.RTChecks);
+ if (!VectorizingEpilogue) {
+ // Checks are the same for all VPlans, added to BestVPlan only for
+ // compactness.
+ attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
+ }
+ // Retrieving VectorPH now when it's easier while VPlan still has Regions.
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
@@ -7304,9 +7306,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
// Move check blocks to their final position.
- if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemCheckBlock())
+ // TODO: Move as part of VPIRBB execute and update impacted tests.
+ if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
MemCheckBlock->moveAfter(EntryBB);
- if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVCheckBlock())
+ if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second)
SCEVCheckBlock->moveAfter(EntryBB);
BestVPlan.execute(&State);
@@ -7515,13 +7518,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
// Retrieve blocks with SCEV and memory runtime checks, if they have been
// connected to the CFG, otherwise they are unused and will be deleted. Their
// terminators and phis using them need adjusting below.
- BasicBlock *SCEVCheckBlock = RTChecks.getSCEVCheckBlock();
- if (SCEVCheckBlock && pred_empty(SCEVCheckBlock))
- SCEVCheckBlock = nullptr;
- BasicBlock *MemCheckBlock = RTChecks.getMemCheckBlock();
- if (MemCheckBlock && pred_empty(MemCheckBlock))
- MemCheckBlock = nullptr;
-
+ BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
+ BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
if (SCEVCheckBlock)
SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
@@ -7676,8 +7674,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
(CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
? GEPNoWrapFlags::none()
: GEPNoWrapFlags::inBounds();
- VectorPtr = new VPVectorEndPointerRecipe(
- Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
+ VectorPtr =
+ new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
+ /*Stride*/ -1, Flags, I->getDebugLoc());
} else {
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
GEP ? GEP->getNoWrapFlags()
@@ -8055,15 +8054,15 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
// something that isn't another partial reduction. This is because the
// extends are intended to be lowered along with the reduction itself.
- // Build up a set of partial reduction bin ops for efficient use checking.
- SmallSet<User *, 4> PartialReductionBinOps;
+ // Build up a set of partial reduction ops for efficient use checking.
+ SmallSet<User *, 4> PartialReductionOps;
for (const auto &[PartialRdx, _] : PartialReductionChains)
- PartialReductionBinOps.insert(PartialRdx.BinOp);
+ PartialReductionOps.insert(PartialRdx.ExtendUser);
auto ExtendIsOnlyUsedByPartialReductions =
- [&PartialReductionBinOps](Instruction *Extend) {
+ [&PartialReductionOps](Instruction *Extend) {
return all_of(Extend->users(), [&](const User *U) {
- return PartialReductionBinOps.contains(U);
+ return PartialReductionOps.contains(U);
});
};
@@ -8072,7 +8071,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
for (auto Pair : PartialReductionChains) {
PartialReductionChain Chain = Pair.first;
if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
- ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
+ (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
}
}
@@ -8080,7 +8079,6 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
bool VPRecipeBuilder::getScaledReductions(
Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
-
if (!CM.TheLoop->contains(RdxExitInstr))
return false;
@@ -8109,43 +8107,71 @@ bool VPRecipeBuilder::getScaledReductions(
if (PhiOp != PHI)
return false;
- auto *BinOp = dyn_cast<BinaryOperator>(Op);
- if (!BinOp || !BinOp->hasOneUse())
- return false;
-
using namespace llvm::PatternMatch;
- // Use the side-effect of match to replace BinOp only if the pattern is
- // matched, we don't care at this point whether it actually matched.
- match(BinOp, m_Neg(m_BinOp(BinOp)));
- Value *A, *B;
- if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
- !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
- return false;
+ // If the update is a binary operator, check both of its operands to see if
+ // they are extends. Otherwise, see if the update comes directly from an
+ // extend.
+ Instruction *Exts[2] = {nullptr};
+ BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Op);
+ std::optional<unsigned> BinOpc;
+ Type *ExtOpTypes[2] = {nullptr};
+
+ auto CollectExtInfo = [&Exts,
+ &ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
+ unsigned I = 0;
+ for (Value *OpI : Ops) {
+ Value *ExtOp;
+ if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
+ return false;
+ Exts[I] = cast<Instruction>(OpI);
+ ExtOpTypes[I] = ExtOp->getType();
+ I++;
+ }
+ return true;
+ };
+
+ if (ExtendUser) {
+ if (!ExtendUser->hasOneUse())
+ return false;
- Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
- Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
+ // Use the side-effect of match to replace BinOp only if the pattern is
+ // matched, we don't care at this point whether it actually matched.
+ match(ExtendUser, m_Neg(m_BinOp(ExtendUser)));
+
+ SmallVector<Value *> Ops(ExtendUser->operands());
+ if (!CollectExtInfo(Ops))
+ return false;
+
+ BinOpc = std::make_optional(ExtendUser->getOpcode());
+ } else if (match(Update, m_Add(m_Value(), m_Value()))) {
+ // We already know the operands for Update are Op and PhiOp.
+ SmallVector<Value *> Ops({Op});
+ if (!CollectExtInfo(Ops))
+ return false;
+
+ ExtendUser = Update;
+ BinOpc = std::nullopt;
+ } else
+ return false;
TTI::PartialReductionExtendKind OpAExtend =
- TargetTransformInfo::getPartialReductionExtendKind(ExtA);
+ TTI::getPartialReductionExtendKind(Exts[0]);
TTI::PartialReductionExtendKind OpBExtend =
- TargetTransformInfo::getPartialReductionExtendKind(ExtB);
-
- PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp);
+ Exts[1] ? TTI::getPartialReductionExtendKind(Exts[1]) : TTI::PR_None;
+ PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser);
TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits();
- TypeSize ASize = A->getType()->getPrimitiveSizeInBits();
-
+ TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits();
if (!PHISize.hasKnownScalarFactor(ASize))
return false;
-
unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(ASize);
if (LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) {
InstructionCost Cost = TTI->getPartialReductionCost(
- Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
- VF, OpAExtend, OpBExtend, BinOp->getOpcode(), CM.CostKind);
+ Update->getOpcode(), ExtOpTypes[0], ExtOpTypes[1],
+ PHI->getType(), VF, OpAExtend, OpBExtend, BinOpc, CM.CostKind);
return Cost.isValid();
},
Range)) {
@@ -8165,10 +8191,10 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
SmallVector<VPValue *, 4> Operands(R->operands());
if (auto *PhiR = dyn_cast<VPWidenPHIRecipe>(R)) {
VPBasicBlock *Parent = PhiR->getParent();
- VPRegionBlock *LoopRegionOf = Parent->getEnclosingLoopRegion();
+ [[maybe_unused]] VPRegionBlock *LoopRegionOf =
+ Parent->getEnclosingLoopRegion();
assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
"Non-header phis should have been handled during predication");
- (void)LoopRegionOf;
auto *Phi = cast<PHINode>(R->getUnderlyingInstr());
assert(Operands.size() == 2 && "Must have 2 operands for header phis");
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
@@ -8953,8 +8979,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind Kind = RdxDesc.getRecurrenceKind();
assert(
!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
- !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
- "AnyOf and FindLast reductions are not allowed for in-loop reductions");
+ !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
+ "AnyOf and FindIV reductions are not allowed for in-loop reductions");
// Collect the chain of "link" recipes for the reduction starting at PhiR.
SetVector<VPSingleDefRecipe *> Worklist;
@@ -9112,34 +9138,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
cast<VPInstruction>(&U)->getOpcode() ==
VPInstruction::ComputeReductionResult ||
cast<VPInstruction>(&U)->getOpcode() ==
- VPInstruction::ComputeFindLastIVResult);
+ VPInstruction::ComputeFindIVResult);
});
if (CM.usePredicatedReductionSelect())
PhiR->setOperand(1, NewExitingVPV);
}
- // If the vector reduction can be performed in a smaller type, we truncate
- // then extend the loop exit value to enable InstCombine to evaluate the
- // entire expression in the smaller type.
- if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
- !RecurrenceDescriptor::isAnyOfRecurrenceKind(
- RdxDesc.getRecurrenceKind())) {
- assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
- Type *RdxTy = RdxDesc.getRecurrenceType();
- auto *Trunc =
- new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
- auto *Extnd =
- RdxDesc.isSigned()
- ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
- : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
-
- Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
- Extnd->insertAfter(Trunc);
- if (PhiR->getOperand(1) == NewExitingVPV)
- PhiR->setOperand(1, Extnd->getVPSingleValue());
- NewExitingVPV = Extnd;
- }
-
// We want code in the middle block to appear to execute on the location of
// the scalar loop's latch terminator because: (a) it is all compiler
// generated, (b) these instructions are always executed after evaluating
@@ -9156,12 +9160,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPInstruction *FinalReductionResult;
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(MiddleVPBB, IP);
- if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
+ if (RecurrenceDescriptor::isFindIVRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
VPValue *Start = PhiR->getStartValue();
VPValue *Sentinel = Plan->getOrAddLiveIn(RdxDesc.getSentinelValue());
FinalReductionResult =
- Builder.createNaryOp(VPInstruction::ComputeFindLastIVResult,
+ Builder.createNaryOp(VPInstruction::ComputeFindIVResult,
{PhiR, Start, Sentinel, NewExitingVPV}, ExitDL);
} else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
@@ -9178,6 +9182,34 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
Builder.createNaryOp(VPInstruction::ComputeReductionResult,
{PhiR, NewExitingVPV}, Flags, ExitDL);
}
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
+ !RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
+ assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(
+ RdxDesc.getRecurrenceKind()) &&
+ "Unexpected truncated min-max recurrence!");
+ Type *RdxTy = RdxDesc.getRecurrenceType();
+ auto *Trunc =
+ new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
+ Instruction::CastOps ExtendOpc =
+ RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
+ auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy);
+ Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
+ Extnd->insertAfter(Trunc);
+ if (PhiR->getOperand(1) == NewExitingVPV)
+ PhiR->setOperand(1, Extnd->getVPSingleValue());
+
+ // Update ComputeReductionResult with the truncated exiting value and
+ // extend its result.
+ FinalReductionResult->setOperand(1, Trunc);
+ FinalReductionResult =
+ Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
+ }
+
// Update all users outside the vector region. Also replace redundant
// ExtractLastElement.
for (auto *U : to_vector(OrigExitingVPV->users())) {
@@ -9224,16 +9256,16 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
continue;
}
- if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
+ if (RecurrenceDescriptor::isFindIVRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
- // Adjust the start value for FindLastIV recurrences to use the sentinel
- // value after generating the ResumePhi recipe, which uses the original
- // start value.
+ // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
+ // the sentinel value after generating the ResumePhi recipe, which uses
+ // the original start value.
PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
}
RecurKind RK = RdxDesc.getRecurrenceKind();
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
- !RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
!RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
VPBuilder PHBuilder(Plan->getVectorPreheader());
VPValue *Iden = Plan->getOrAddLiveIn(
@@ -9259,18 +9291,18 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPlanTransforms::runPass(VPlanTransforms::clearReductionWrapFlags, *Plan);
}
-void LoopVectorizationPlanner::addRuntimeChecks(
- VPlan &Plan, GeneratedRTChecks &RTChecks) const {
+void LoopVectorizationPlanner::attachRuntimeChecks(
+ VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
SmallVector<std::pair<VPValue *, VPIRBasicBlock *>> Checks;
- const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.emitSCEVChecks();
+ const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
if (SCEVCheckBlock) {
assert((!CM.OptForSize ||
CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
"Cannot SCEV check stride or overflow when optimizing for size");
- Checks.emplace_back(Plan.getOrAddLiveIn(SCEVCheckCond),
- Plan.createVPIRBasicBlock(SCEVCheckBlock));
+ VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
+ HasBranchWeights);
}
- const auto &[MemCheckCond, MemCheckBlock] = RTChecks.emitMemRuntimeChecks();
+ const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
if (MemCheckBlock) {
// VPlan-native path does not do any analysis for runtime checks
// currently.
@@ -9292,13 +9324,9 @@ void LoopVectorizationPlanner::addRuntimeChecks(
"(e.g., adding 'restrict').";
});
}
- Checks.emplace_back(Plan.getOrAddLiveIn(MemCheckCond),
- Plan.createVPIRBasicBlock(MemCheckBlock));
+ VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
+ HasBranchWeights);
}
-
- VPlanTransforms::connectCheckBlocks(
- Plan, Checks,
- hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()));
}
void VPDerivedIVRecipe::execute(VPTransformState &State) {
@@ -9655,18 +9683,18 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, MainPlan);
using namespace VPlanPatternMatch;
- // When vectorizing the epilogue, FindLastIV reductions can introduce multiple
- // uses of undef/poison. If the reduction start value may be undef or poison
- // it needs to be frozen and the frozen start has to be used when computing
- // the reduction result. We also need to use the frozen value in the resume
- // phi generated by the main vector loop, as this is also used to compute the
- // reduction result after the epilogue vector loop.
+ // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
+ // introduce multiple uses of undef/poison. If the reduction start value may
+ // be undef or poison it needs to be frozen and the frozen start has to be
+ // used when computing the reduction result. We also need to use the frozen
+ // value in the resume phi generated by the main vector loop, as this is also
+ // used to compute the reduction result after the epilogue vector loop.
auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
bool UpdateResumePhis) {
VPBuilder Builder(Plan.getEntry());
for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
auto *VPI = dyn_cast<VPInstruction>(&R);
- if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindLastIVResult)
+ if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult)
continue;
VPValue *OrigStart = VPI->getOperand(1);
if (isGuaranteedNotToBeUndefOrPoison(OrigStart->getLiveInIRValue()))
@@ -9761,7 +9789,7 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
return VPI &&
(VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
- VPI->getOpcode() == VPInstruction::ComputeFindLastIVResult);
+ VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
}));
ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
->getIncomingValueForBlock(L->getLoopPreheader());
@@ -9779,20 +9807,20 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
- } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
+ } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(RK)) {
Value *StartV = getStartValueFromReductionResult(RdxResult);
assert(RdxDesc.getRecurrenceStartValue() == StartV &&
- "start value from ComputeFindLastIVResult must match");
+ "start value from ComputeFinIVResult must match");
ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
EPI.MainLoopIterationCountCheck);
- // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
- // to the resume value. The resume value is adjusted to the sentinel
- // value when the final value from the main vector loop equals the start
- // value. This ensures correctness when the start value might not be
- // less than the minimum value of a monotonically increasing induction
- // variable.
+ // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
+ // an adjustment to the resume value. The resume value is adjusted to
+ // the sentinel value when the final value from the main vector loop
+ // equals the start value. This ensures correctness when the start value
+ // might not be less than the minimum value of a monotonically
+ // increasing induction variable.
BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
@@ -10052,6 +10080,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();
+ if (LVL.hasUncountableEarlyExit() && UserIC != 1 &&
+ !VectorizerParams::isInterleaveForced()) {
+ UserIC = 1;
+ reportVectorizationInfo("Interleaving not supported for loops "
+ "with uncountable early exits",
+ "InterleaveEarlyExitDisabled", ORE, L);
+ }
// Plan how to best vectorize.
LVP.plan(UserVF, UserIC);
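
For readers skimming the thread: stripped of the size-optimization assert and the remark emission, the new hand-off from the planner to the VPlan transform boils down to the sketch below (a condensed illustration with a hypothetical function name, not the exact code from the patch).

static void attachRuntimeChecksSketch(VPlan &Plan, GeneratedRTChecks &RTChecks,
                                      bool HasBranchWeights) {
  // SCEV predicate checks: if a check block was generated, wrap it and its
  // condition into the plan; a failing check bypasses to the scalar loop.
  const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
  if (SCEVCheckBlock)
    VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
                                      HasBranchWeights);

  // Memory overlap checks: attached second, so the resulting layout is
  // preheader -> vector.scevcheck -> vector.memcheck -> vector.ph, with each
  // check block also branching to scalar.ph (see the test updates below).
  const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
  if (MemCheckBlock)
    VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
                                      HasBranchWeights);
}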
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 3b629b42e2752..40a55656bfa7e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -488,9 +488,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
// Set each forward successor here when it is created, excluding
// backedges. A backward successor is set when the branch is created.
// Branches to VPIRBasicBlocks must have the same successors in VPlan as
- // in the original IR, except if the predecessors is the entry block. This
- // enables including SCEV and memory runtime check blocks in VPlan.
- // TODO: Remove exception by modeling branch in the entry block using
+ // in the original IR, except when the predecessor is the entry block.
+ // This enables including SCEV and memory runtime check blocks in VPlan.
+ // TODO: Remove exception by modeling the terminator of the entry block using
// BranchOnCond.
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
assert((TermBr && (!TermBr->getSuccessor(idx) ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 1f594260c4426..98cbc2054041d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -591,20 +591,21 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
TopRegion->getEntryBasicBlock()->setName("vector.body");
}
-// Likelyhood of bypassing the vectorized loop due to SCEV or memory runtime
-// checks.
+// Likelihood of bypassing the vectorized loop due to a runtime check block,
+// i.e. the memory overlap check block or the wrapping/unit-stride check block.
static constexpr uint32_t CheckBypassWeights[] = {1, 127};
-void VPlanTransforms::connectCheckBlocks(
- VPlan &Plan, ArrayRef<std::pair<VPValue *, VPIRBasicBlock *>> Checks,
- bool AddBranchWeights) {
+void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
+ BasicBlock *CheckBlock,
+ bool AddBranchWeights) {
+ VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
+ VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
VPBlockBase *VectorPH = Plan.getVectorPreheader();
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
- for (const auto &[Cond, CheckBlock] : Checks) {
VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
- VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlock);
- VPBlockUtils::connectBlocks(CheckBlock, ScalarPH);
- CheckBlock->swapSuccessors();
+ VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlockVPBB);
+ VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
+ CheckBlockVPBB->swapSuccessors();
// We just connected a new block to the scalar preheader. Update all
// VPPhis by adding an incoming value for it, replicating the last value.
@@ -617,8 +618,8 @@ void VPlanTransforms::connectCheckBlocks(
}
VPIRMetadata VPBranchWeights;
- auto *Term = VPBuilder(CheckBlock)
- .createNaryOp(VPInstruction::BranchOnCond, {Cond},
+ auto *Term = VPBuilder(CheckBlockVPBB)
+ .createNaryOp(VPInstruction::BranchOnCond, {CondVPV},
Plan.getCanonicalIV()->getDebugLoc());
if (AddBranchWeights) {
MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
@@ -626,5 +627,4 @@ void VPlanTransforms::connectCheckBlocks(
MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false);
Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
}
- }
}
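
A note on the weights: {1, 127} marks the bypass edge as expected to be taken roughly once every 128 executions. The stand-alone snippet below (hypothetical helper, not code from the patch) shows the same MDBuilder call attached directly to an IR conditional branch whose first successor is the bypass target, mirroring the BranchOnCond terminator above.

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

using namespace llvm;

// Same weights as CheckBypassWeights: successor 0 (bypass to the scalar loop)
// gets weight 1, successor 1 (continue to the vector loop) gets weight 127,
// i.e. a ~1/128 bypass probability.
static constexpr uint32_t BypassWeights[] = {1, 127};

static void annotateCheckBranch(BranchInst *Br) {
  MDBuilder MDB(Br->getContext());
  MDNode *Weights =
      MDB.createBranchWeights(BypassWeights, /*IsExpected=*/false);
  Br->setMetadata(LLVMContext::MD_prof, Weights);
}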
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 515f656d4be3c..7763bd810fb88 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -74,12 +74,11 @@ struct VPlanTransforms {
/// flat CFG into a hierarchical CFG.
static void createLoopRegions(VPlan &Plan);
- /// Connect the blocks in \p Checks to \p Plan, using the corresponding
- /// VPValue as branch condition.
- static void
- connectCheckBlocks(VPlan &Plan,
- ArrayRef<std::pair<VPValue *, VPIRBasicBlock *>> Checks,
- bool AddBranchWeights);
+ /// Wrap runtime check block \p CHeckBlock in a VPIRBB and \p Cond in a
+ /// VPValue and connect the block to \p Plan, using the VPValue as branch
+ /// condition.
+ static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
+ bool AddBranchWeights);
/// Replaces the VPInstructions in \p Plan with corresponding
/// widen recipes. Returns false if any VPInstructions could not be converted
@@ -245,7 +244,9 @@ struct VPlanTransforms {
/// Add branch weight metadata, if the \p Plan's middle block is terminated by
/// a BranchOnCond recipe.
- static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF);
+ static void
+ addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
+ std::optional<unsigned> VScaleForTuning);
};
} // namespace llvm
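
For context on the declaration above: ignoring the scalar-preheader phi update, the assertions, and the branch-weight metadata shown in the VPlanConstruction.cpp hunk, the wiring attachCheckBlock performs is roughly the following sketch (not a drop-in replacement for the real implementation).

  VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
  VPBasicBlock *CheckVPBB = Plan.createVPIRBasicBlock(CheckBlock);
  VPBlockBase *VectorPH = Plan.getVectorPreheader();
  VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();

  // Put the check block on the edge into the vector preheader ...
  VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckVPBB);
  // ... give it a second successor into the scalar preheader ...
  VPBlockUtils::connectBlocks(CheckVPBB, Plan.getScalarPreheader());
  // ... and reorder the successors so the bypass edge to the scalar
  // preheader comes first, i.e. it is taken when the check condition is true.
  CheckVPBB->swapSuccessors();

  // Terminate the block with a conditional branch on the wrapped condition.
  VPBuilder(CheckVPBB).createNaryOp(VPInstruction::BranchOnCond, {CondVPV},
                                    Plan.getCanonicalIV()->getDebugLoc());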
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 002fae6827557..8ec38912af435 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -283,7 +283,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %2 = mul nuw i64 %1, 4
; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2
-; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
+; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.ph
; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.scevcheck> in BB: vector.scevcheck
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: vector.scevcheck: ; No predecessors!
@@ -298,6 +298,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %8 = or i1 %7, %mul.overflow
; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT: %10 = or i1 %8, %9
+; CHECK-NEXT: br i1 %10, <null operand!>, <null operand!>
+; CHECK-NEXT: LV: draw edge from for.body.preheader
+; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.memcheck> in BB: vector.memcheck
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: vector.memcheck: ; No predecessors!
; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64()
@@ -305,6 +308,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %13 = mul i64 %12, 4
; CHECK-NEXT: %14 = sub i64 %B1, %A2
; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT: br i1 %diff.check, <null operand!>, <null operand!>
+; CHECK-NEXT: LV: draw edge from vector.scevcheck
+; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.ph> in BB: vector.ph
+; CHECK-NEXT: LV: filled BB:
+; CHECK-NEXT: vector.ph: ; No predecessors!
+; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: %16 = mul nuw i64 %15, 4
+; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16
+; CHECK-NEXT: %n.vec = sub i64 %0, %n.mod.vf
; CHECK-NEXT: %17 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %18 = mul nuw i64 %17, 4
; CHECK-NEXT: %19 = sub i64 %0, %n.vec
@@ -324,22 +336,24 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %22 = zext i32 %21 to i64
; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22
; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 1, %18
-; CHECK-NEXT: %26 = getelementptr inbounds i32, ptr %23, i64 %24
-; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %26, i64 %25
-; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %27, align 4
+; CHECK-NEXT: %25 = sub i64 %18, 1
+; CHECK-NEXT: %26 = mul i64 -1, %25
+; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %23, i64 %24
+; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %27, i64 %26
+; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %28, align 4
; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load)
-; CHECK-NEXT: %28 = add <vscale x 4 x i32> %reverse, splat (i32 1)
-; CHECK-NEXT: %29 = getelementptr inbounds i32, ptr %A, i64 %22
-; CHECK-NEXT: %30 = mul i64 0, %18
-; CHECK-NEXT: %31 = sub i64 1, %18
-; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %29, i64 %30
-; CHECK-NEXT: %33 = getelementptr inbounds i32, ptr %32, i64 %31
-; CHECK-NEXT: %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %28)
-; CHECK-NEXT: store <vscale x 4 x i32> %reverse4, ptr %33, align 4
+; CHECK-NEXT: %29 = add <vscale x 4 x i32> %reverse, splat (i32 1)
+; CHECK-NEXT: %30 = getelementptr inbounds i32, ptr %A, i64 %22
+; CHECK-NEXT: %31 = mul i64 0, %18
+; CHECK-NEXT: %32 = sub i64 %18, 1
+; CHECK-NEXT: %33 = mul i64 -1, %32
+; CHECK-NEXT: %34 = getelementptr inbounds i32, ptr %30, i64 %31
+; CHECK-NEXT: %35 = getelementptr inbounds i32, ptr %34, i64 %33
+; CHECK-NEXT: %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %29)
+; CHECK-NEXT: store <vscale x 4 x i32> %reverse4, ptr %35, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge from vector.body
; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -370,8 +384,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: %35 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: %add9 = add i32 %35, 1
+; CHECK-NEXT: %37 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: %add9 = add i32 %37, 1
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -627,6 +641,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16
; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf
; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
+; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
@@ -679,6 +696,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %2 = mul nuw i64 %1, 4
; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2
+; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.ph
+; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.scevcheck> in BB: vector.scevcheck
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: vector.scevcheck: ; No predecessors!
; CHECK-NEXT: %3 = add nsw i64 %0, -1
@@ -692,6 +711,19 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %8 = or i1 %7, %mul.overflow
; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT: %10 = or i1 %8, %9
+; CHECK-NEXT: br i1 %10, <null operand!>, <null operand!>
+; CHECK-NEXT: LV: draw edge from for.body.preheader
+; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.memcheck> in BB: vector.memcheck
+; CHECK-NEXT: LV: filled BB:
+; CHECK-NEXT: vector.memcheck: ; No predecessors!
+; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: %12 = mul nuw i64 %11, 4
+; CHECK-NEXT: %13 = mul i64 %12, 4
+; CHECK-NEXT: %14 = sub i64 %B1, %A2
+; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT: br i1 %diff.check, <null operand!>, <null operand!>
+; CHECK-NEXT: LV: draw edge from vector.scevcheck
+; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.ph> in BB: vector.ph
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: vector.ph: ; No predecessors!
; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64()
@@ -717,22 +749,24 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %22 = zext i32 %21 to i64
; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22
; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 1, %18
-; CHECK-NEXT: %26 = getelementptr inbounds float, ptr %23, i64 %24
-; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %26, i64 %25
-; CHECK-NEXT: %wide.load = load <vscale x 4 x float>, ptr %27, align 4
+; CHECK-NEXT: %25 = sub i64 %18, 1
+; CHECK-NEXT: %26 = mul i64 -1, %25
+; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %23, i64 %24
+; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %27, i64 %26
+; CHECK-NEXT: %wide.load = load <vscale x 4 x float>, ptr %28, align 4
; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %wide.load)
-; CHECK-NEXT: %28 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
-; CHECK-NEXT: %29 = getelementptr inbounds float, ptr %A, i64 %22
-; CHECK-NEXT: %30 = mul i64 0, %18
-; CHECK-NEXT: %31 = sub i64 1, %18
-; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %29, i64 %30
-; CHECK-NEXT: %33 = getelementptr inbounds float, ptr %32, i64 %31
-; CHECK-NEXT: %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %28)
-; CHECK-NEXT: store <vscale x 4 x float> %reverse4, ptr %33, align 4
+; CHECK-NEXT: %29 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
+; CHECK-NEXT: %30 = getelementptr inbounds float, ptr %A, i64 %22
+; CHECK-NEXT: %31 = mul i64 0, %18
+; CHECK-NEXT: %32 = sub i64 %18, 1
+; CHECK-NEXT: %33 = mul i64 -1, %32
+; CHECK-NEXT: %34 = getelementptr inbounds float, ptr %30, i64 %31
+; CHECK-NEXT: %35 = getelementptr inbounds float, ptr %34, i64 %33
+; CHECK-NEXT: %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %29)
+; CHECK-NEXT: store <vscale x 4 x float> %reverse4, ptr %35, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge from vector.body
; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -763,8 +797,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: %35 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: %conv1 = fadd float %35, 1.000000e+00
+; CHECK-NEXT: %37 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: %conv1 = fadd float %37, 1.000000e+00
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
>From 806a6e2305d0fa4130b8658e26b8d257cf904ca6 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 5 Jul 2025 22:08:45 +0100
Subject: [PATCH 4/5] !fixup fix formatting
---
.../Transforms/Vectorize/LoopVectorize.cpp | 2 +-
.../Vectorize/VPlanConstruction.cpp | 48 +++++++++----------
2 files changed, 25 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 45f503f627ae9..9639525869815 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2010,7 +2010,7 @@ class GeneratedRTChecks {
return {nullptr, nullptr};
AddedAnyChecks = true;
- return {SCEVCheckCond , SCEVCheckBlock};
+ return {SCEVCheckCond, SCEVCheckBlock};
}
/// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 98cbc2054041d..37e15326e01f9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -602,29 +602,29 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
VPBlockBase *VectorPH = Plan.getVectorPreheader();
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
- VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
- VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlockVPBB);
- VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
- CheckBlockVPBB->swapSuccessors();
-
- // We just connected a new block to the scalar preheader. Update all
- // VPPhis by adding an incoming value for it, replicating the last value.
- unsigned NumPredecessors = ScalarPH->getNumPredecessors();
- for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
- assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
- assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
- "must have incoming values for all operands");
- R.addOperand(R.getOperand(NumPredecessors - 2));
- }
+ VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
+ VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlockVPBB);
+ VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
+ CheckBlockVPBB->swapSuccessors();
+
+ // We just connected a new block to the scalar preheader. Update all
+ // VPPhis by adding an incoming value for it, replicating the last value.
+ unsigned NumPredecessors = ScalarPH->getNumPredecessors();
+ for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+ assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
+ assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
+ "must have incoming values for all operands");
+ R.addOperand(R.getOperand(NumPredecessors - 2));
+ }
- VPIRMetadata VPBranchWeights;
- auto *Term = VPBuilder(CheckBlockVPBB)
- .createNaryOp(VPInstruction::BranchOnCond, {CondVPV},
- Plan.getCanonicalIV()->getDebugLoc());
- if (AddBranchWeights) {
- MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
- MDNode *BranchWeights =
- MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false);
- Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
- }
+ VPIRMetadata VPBranchWeights;
+ auto *Term = VPBuilder(CheckBlockVPBB)
+ .createNaryOp(VPInstruction::BranchOnCond, {CondVPV},
+ Plan.getCanonicalIV()->getDebugLoc());
+ if (AddBranchWeights) {
+ MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
+ MDNode *BranchWeights =
+ MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false);
+ Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
+ }
}
>From 1b8e1a0de52c927422200625c12158ecc99fa8e1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 9 Jul 2025 11:49:04 +0100
Subject: [PATCH 5/5] !fix address latest comments, thanks!
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +----
llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 2 +-
2 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 59315cc742cad..992f98cec0010 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7509,9 +7509,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
- // Retrieve blocks with SCEV and memory runtime checks, if they have been
- // connected to the CFG, otherwise they are unused and will be deleted. Their
- // terminators and phis using them need adjusting below.
+ // Adjust the terminators of the runtime check blocks and the phis using them.
BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
if (SCEVCheckBlock)
@@ -9285,7 +9283,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
void LoopVectorizationPlanner::attachRuntimeChecks(
VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
- SmallVector<std::pair<VPValue *, VPIRBasicBlock *>> Checks;
const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
if (SCEVCheckBlock) {
assert((!CM.OptForSize ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 7763bd810fb88..b42c444f09be8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -74,7 +74,7 @@ struct VPlanTransforms {
/// flat CFG into a hierarchical CFG.
static void createLoopRegions(VPlan &Plan);
- /// Wrap runtime check block \p CHeckBlock in a VPIRBB and \p Cond in a
+ /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
/// VPValue and connect the block to \p Plan, using the VPValue as branch
/// condition.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,