[llvm] [VPlan] Enable vectorization of early-exit loops with unit-stride fault-only-first loads (PR #151300)
Shih-Po Hung via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 26 23:46:14 PDT 2025
https://github.com/arcbbb updated https://github.com/llvm/llvm-project/pull/151300
>From bc2a814c51d11b9bf4c0ac9e918966d48a8bf1ff Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Fri, 25 Jul 2025 16:24:16 -0700
Subject: [PATCH 1/4] Support WidenFFLoad in early-exit loop
---
.../Transforms/Vectorize/LoopVectorize.cpp | 42 +++-
llvm/lib/Transforms/Vectorize/VPlan.h | 45 ++++
.../Transforms/Vectorize/VPlanAnalysis.cpp | 5 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 43 ++++
.../Transforms/Vectorize/VPlanTransforms.cpp | 97 +++++++
.../Transforms/Vectorize/VPlanTransforms.h | 11 +
llvm/lib/Transforms/Vectorize/VPlanValue.h | 3 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 4 +-
.../Transforms/LoopVectorize/RISCV/find.ll | 236 ++++++++++++++++++
9 files changed, 481 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/find.ll
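[Note below the diffstat, not part of the commit message: the contract of the fault-only-first load that this patch builds on is easiest to see from the CHECK lines of the new find.ll test at the end of this diff. A minimal standalone sketch of that contract, reusing the intrinsic signature exactly as it appears in those CHECK lines; the function and value names here are illustrative only:]

; Sketch only: a unit-stride fault-only-first load returns the loaded data
; plus the number of lanes actually read (equivalently, the index of the
; first faulting lane, or the requested AVL if no lane faulted).
declare { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr, <vscale x 4 x i1>, i32)

define i32 @ffload_contract(ptr %p, <vscale x 4 x i1> %mask, i32 %avl) {
  %ff = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 %p, <vscale x 4 x i1> %mask, i32 %avl)
  ; Field 0: the vector of loaded elements; lanes at and past the first
  ; faulting lane are poison and must not feed a branch.
  %data = extractvalue { <vscale x 4 x i32>, i32 } %ff, 0
  ; Field 1: how many leading lanes are valid; the patch uses this both for
  ; stepping the induction variable and for masking the early-exit condition.
  %vl = extractvalue { <vscale x 4 x i32>, i32 } %ff, 1
  ret i32 %vl
}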
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index facb0fabdf57e..f53b7bd354b53 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -393,6 +393,12 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));
+static cl::opt<bool>
+ EnableEarlyExitWithFFLoads("enable-early-exit-with-ffload", cl::init(false),
+ cl::Hidden,
+ cl::desc("Enable vectorization of early-exit "
+ "loops with fault-only-first loads."));
+
static cl::opt<bool> ConsiderRegPressure(
"vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
cl::desc("Discard VFs if their register pressure is too high."));
@@ -3491,6 +3497,15 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}
+ if (!Legal->getPotentiallyFaultingLoads().empty() && UserIC > 1) {
+ reportVectorizationFailure("Auto-vectorization of loops with potentially "
+ "faulting loads is not supported when the "
+ "interleave count is more than 1",
+ "CantInterleaveLoopWithPotentiallyFaultingLoads",
+ ORE, TheLoop);
+ return FixedScalableVFPair::getNone();
+ }
+
ScalarEvolution *SE = PSE.getSE();
ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
@@ -4061,6 +4076,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPReductionPHISC:
case VPDef::VPInterleaveEVLSC:
case VPDef::VPInterleaveSC:
+ case VPDef::VPWidenFFLoadSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenStoreEVLSC:
@@ -4549,6 +4565,10 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
if (!Legal->isSafeForAnyVectorWidth())
return 1;
+ // No interleaving for potentially faulting loads.
+ if (!Legal->getPotentiallyFaultingLoads().empty())
+ return 1;
+
// We don't attempt to perform interleaving for loops with uncountable early
// exits because the VPInstruction::AnyOf code cannot currently handle
// multiple parts.
@@ -7253,6 +7273,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Regions are dissolved after optimizing for VF and UF, which completely
// removes unneeded loop regions first.
VPlanTransforms::dissolveLoopRegions(BestVPlan);
+
+ VPlanTransforms::convertFFLoadEarlyExitToVLStepping(BestVPlan);
+
// Canonicalize EVL loops after regions are dissolved.
VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
@@ -7541,6 +7564,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
+ if (Legal->getPotentiallyFaultingLoads().contains(I))
+ return new VPWidenFFLoadRecipe(*cast<LoadInst>(I), Ptr, &Plan.getVF(), Mask,
+ VPIRMetadata(*I, LVer), I->getDebugLoc());
+
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
VPIRMetadata(*Load, LVer), I->getDebugLoc());
@@ -8378,6 +8405,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
if (Recipe->getNumDefinedValues() == 1) {
SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
Old2New[SingleDef] = Recipe->getVPSingleValue();
+ } else if (isa<VPWidenFFLoadRecipe>(Recipe)) {
+ VPValue *Data = Recipe->getVPValue(0);
+ SingleDef->replaceAllUsesWith(Data);
+ Old2New[SingleDef] = Data;
} else {
assert(Recipe->getNumDefinedValues() == 0 &&
"Unexpected multidef recipe");
@@ -8427,6 +8458,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
+ VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(*Plan);
+
// Apply mandatory transformation to handle FP maxnum/minnum reduction with
// NaNs if possible, bail out otherwise.
if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
@@ -9747,7 +9780,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (!LVL.getPotentiallyFaultingLoads().empty()) {
+ if (EnableEarlyExitWithFFLoads) {
+ if (LVL.getPotentiallyFaultingLoads().size() > 1) {
+ reportVectorizationFailure("Auto-vectorization of loops with more than 1 "
+ "potentially faulting load is not enabled",
+ "MoreThanOnePotentiallyFaultingLoad", ORE, L);
+ return false;
+ }
+ } else if (!LVL.getPotentiallyFaultingLoads().empty()) {
reportVectorizationFailure("Auto-vectorization of loops with potentially "
"faulting load is not supported",
"PotentiallyFaultingLoadsNotSupported", ORE, L);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5b9f005e50e47..33eec80f97eff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -559,6 +559,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
+ case VPRecipeBase::VPWidenFFLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -2836,6 +2837,13 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe {
ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
R.isOrdered(), DL) {}
+ VPReductionEVLRecipe(RecurKind RdxKind, FastMathFlags FMFs, VPValue *ChainOp,
+ VPValue *VecOp, VPValue &EVL, VPValue *CondOp,
+ bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
+ : VPReductionRecipe(VPDef::VPReductionEVLSC, RdxKind, FMFs, nullptr,
+ ArrayRef<VPValue *>({ChainOp, VecOp, &EVL}), CondOp,
+ IsOrdered, DL) {}
+
~VPReductionEVLRecipe() override = default;
VPReductionEVLRecipe *clone() override {
@@ -3213,6 +3221,7 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenFFLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
}
@@ -3294,6 +3303,42 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
}
};
+/// A recipe for widening loads using fault-only-first intrinsics.
+/// Produces two results: (1) the loaded data, and (2) the index of the first
+/// non-dereferenceable lane, or VF if all lanes are successfully read.
+struct VPWidenFFLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
+ VPWidenFFLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *VF, VPValue *Mask,
+ const VPIRMetadata &Metadata, DebugLoc DL)
+ : VPWidenMemoryRecipe(VPDef::VPWidenFFLoadSC, Load, {Addr, VF},
+ /*Consecutive*/ true, /*Reverse*/ false, Metadata,
+ DL),
+ VPValue(this, &Load) {
+ new VPValue(nullptr, this); // Index of the first lane that faults.
+ setMask(Mask);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenFFLoadSC);
+
+ /// Return the VF operand.
+ VPValue *getVF() const { return getOperand(1); }
+ void setVF(VPValue *V) { setOperand(1, V); }
+
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getVF() || Op == getAddr();
+ }
+};
+
/// A recipe for widening load operations with vector-predication intrinsics,
/// using the address to load from, the explicit vector length and an optional
/// mask.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 80a2e4bc3f754..1cd5e81377dc8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -190,8 +190,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
- assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
- "Store recipes should not define any values");
+ assert(
+ (isa<VPWidenLoadRecipe, VPWidenFFLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
+ "Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 931a5b7582c4e..f84456c6ef15d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -74,6 +74,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPReductionPHISC:
case VPScalarIVStepsSC:
case VPPredInstPHISC:
+ case VPWidenFFLoadSC:
return false;
case VPBlendSC:
case VPReductionEVLSC:
@@ -108,6 +109,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
+ case VPWidenFFLoadSC:
return true;
case VPReplicateSC:
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
@@ -3616,6 +3618,47 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPWidenFFLoadRecipe::execute(VPTransformState &State) {
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
+
+ Value *VL = State.get(getVF(), VPLane(0));
+ Type *I32Ty = Builder.getInt32Ty();
+ VL = Builder.CreateZExtOrTrunc(VL, I32Ty);
+ Value *Addr = State.get(getAddr(), true);
+ Value *Mask = nullptr;
+ if (VPValue *VPMask = getMask())
+ Mask = State.get(VPMask);
+ else
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+ CallInst *NewLI =
+ Builder.CreateIntrinsic(Intrinsic::vp_load_ff, {DataTy, Addr->getType()},
+ {Addr, Mask, VL}, nullptr, "vp.op.load.ff");
+ NewLI->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ applyMetadata(*NewLI);
+ Value *V = cast<Instruction>(Builder.CreateExtractValue(NewLI, 0));
+ Value *NewVL = Builder.CreateExtractValue(NewLI, 1);
+ State.set(getVPValue(0), V);
+ State.set(getVPValue(1), NewVL, /*NeedsScalar=*/true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenFFLoadRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN ";
+ printAsOperand(O, SlotTracker);
+ O << ", ";
+ getVPValue(1)->printAsOperand(O, SlotTracker);
+ O << " = vp.load.ff ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
/// Use all-true mask for reverse rather than actual mask, as it avoids a
/// dependence w/o affecting the result.
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 84817d78a077a..5f71b0df7cb01 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2832,6 +2832,103 @@ void VPlanTransforms::addExplicitVectorLength(
Plan.setUF(1);
}
+void VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan) {
+ VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ VPWidenFFLoadRecipe *LastFFLoad = nullptr;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getVectorLoopRegion())))
+ for (VPRecipeBase &R : *VPBB)
+ if (auto *Load = dyn_cast<VPWidenFFLoadRecipe>(&R)) {
+ assert(!LastFFLoad && "Only one FFLoad is supported");
+ LastFFLoad = Load;
+ }
+
+ // Skip if no FFLoad.
+ if (!LastFFLoad)
+ return;
+
+ // Ensure FFLoad does not read past the remainder in the last iteration.
+ // Set AVL to min(VF, remainder).
+ VPBuilder Builder(Header, Header->getFirstNonPhi());
+ VPValue *Remainder = Builder.createNaryOp(
+ Instruction::Sub, {&Plan.getVectorTripCount(), Plan.getCanonicalIV()});
+ VPValue *Cmp =
+ Builder.createICmp(CmpInst::ICMP_ULE, &Plan.getVF(), Remainder);
+ VPValue *AVL = Builder.createSelect(Cmp, &Plan.getVF(), Remainder);
+ LastFFLoad->setVF(AVL);
+
+ // To prevent branch-on-poison, rewrite the early-exit condition to
+ // VPReductionEVLRecipe. Expected pattern here is:
+ // EMIT vp<%alt.exit.cond> = AnyOf
+ // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+ // EMIT branch-on-cond vp<%exit.cond>
+ auto *ExitingLatch =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+ auto *LatchExitingBr = cast<VPInstruction>(ExitingLatch->getTerminator());
+
+ VPValue *VPAnyOf = nullptr;
+ VPValue *VecOp = nullptr;
+ assert(
+ match(LatchExitingBr,
+ m_BranchOnCond(m_BinaryOr(m_VPValue(VPAnyOf), m_VPValue()))) &&
+ match(VPAnyOf, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(VecOp))) &&
+ "unexpected exiting sequence in early exit loop");
+
+ VPValue *OpVPEVLI32 = LastFFLoad->getVPValue(1);
+ VPValue *Mask = LastFFLoad->getMask();
+ FastMathFlags FMF;
+ auto *I1Ty = Type::getInt1Ty(Plan.getContext());
+ VPValue *VPZero = Plan.getOrAddLiveIn(ConstantInt::get(I1Ty, 0));
+ DebugLoc DL = VPAnyOf->getDefiningRecipe()->getDebugLoc();
+ auto *NewAnyOf =
+ new VPReductionEVLRecipe(RecurKind::Or, FMF, VPZero, VecOp, *OpVPEVLI32,
+ Mask, /*IsOrdered*/ false, DL);
+ NewAnyOf->insertBefore(VPAnyOf->getDefiningRecipe());
+ VPAnyOf->replaceAllUsesWith(NewAnyOf);
+
+ // Using FirstActiveLane in the early-exit block is safe: the exiting
+ // conditions guarantee that at least one valid lane precedes any
+ // poisoned lanes.
+}
+
+void VPlanTransforms::convertFFLoadEarlyExitToVLStepping(VPlan &Plan) {
+ // Find loop header by locating VPWidenFFLoadRecipe.
+ VPWidenFFLoadRecipe *LastFFLoad = nullptr;
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry())))
+ for (VPRecipeBase &R : *VPBB)
+ if (auto *Load = dyn_cast<VPWidenFFLoadRecipe>(&R)) {
+ assert(!LastFFLoad && "Only one FFLoad is supported");
+ LastFFLoad = Load;
+ }
+
+ // Skip if no FFLoad.
+ if (!LastFFLoad)
+ return;
+
+ VPBasicBlock *HeaderVPBB = LastFFLoad->getParent();
+ // Replace IVStep (VFxUF) with returned VL from FFLoad.
+ auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
+ VPValue *Backedge = CanonicalIV->getIncomingValue(1);
+ assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
+ m_Specific(&Plan.getVFxUF()))) &&
+ "Unexpected canonical iv");
+ VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
+ VPValue *OpVPEVLI32 = LastFFLoad->getVPValue(1);
+ VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
+ Builder.setInsertPoint(CanonicalIVIncrement);
+ auto *TC = Plan.getTripCount();
+ Type *CanIVTy = TC->isLiveIn()
+ ? TC->getLiveInIRValue()->getType()
+ : cast<VPExpandSCEVRecipe>(TC)->getSCEV()->getType();
+ auto *I32Ty = Type::getInt32Ty(Plan.getContext());
+ VPValue *OpVPEVL = Builder.createScalarZExtOrTrunc(
+ OpVPEVLI32, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
+
+ CanonicalIVIncrement->setOperand(1, OpVPEVL);
+}
+
void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
// Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
// There should be only one EVL PHI in the entire plan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b28559b620e13..6f69ed1ea7935 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -276,6 +276,17 @@ struct VPlanTransforms {
/// (branch-on-cond eq AVLNext, 0)
static void canonicalizeEVLLoops(VPlan &Plan);
+ /// Applies to early-exit loops that use FFLoad. FFLoad may yield fewer active
+ /// lanes than VF. To prevent branch-on-poison and over-reads past the vector
+ /// trip count, use the returned VL for both stepping and exit computation.
+ /// Implemented by:
+ /// - adjustFFLoadEarlyExitForPoisonSafety: replace AnyOf with vp.reduce.or
+ /// over the first VL lanes; set AVL = min(VF, remainder).
+ /// - convertFFLoadEarlyExitToVLStepping: after region dissolution, convert
+ /// early-exit loops to variable-length stepping.
+ static void adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan);
+ static void convertFFLoadEarlyExitToVLStepping(VPlan &Plan);
+
/// Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToConcreteRecipes(VPlan &Plan);
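[Editor's note between files: to make the two transforms documented in the VPlanTransforms.h comment above easier to follow, here is a condensed, annotated sketch of the vector-loop body they produce, adapted from the CHECK lines of find.ll in this patch. Block and value names are illustrative, and the preheader values (%n.vec, %vf, the splat of the search value, the all-true mask) are passed in as arguments to keep the sketch short:]

declare { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr, <vscale x 4 x i1>, i32)
declare i1 @llvm.vp.reduce.or.nxv4i1(i1, <vscale x 4 x i1>, <vscale x 4 x i1>, i32)

define i64 @ff_vector_body_sketch(ptr %first, i64 %n.vec, i64 %vf, <vscale x 4 x i32> %splat, <vscale x 4 x i1> %true.mask) {
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %offset = mul i64 %index, 4
  %gep = getelementptr i8, ptr %first, i64 %offset
  ; AVL = min(VF, remainder): the load never reads past the vector trip count.
  %remainder = sub i64 %n.vec, %index
  %cmp.avl = icmp ule i64 %vf, %remainder
  %avl64 = select i1 %cmp.avl, i64 %vf, i64 %remainder
  %avl = trunc i64 %avl64 to i32
  %ff = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 %gep, <vscale x 4 x i1> %true.mask, i32 %avl)
  %data = extractvalue { <vscale x 4 x i32>, i32 } %ff, 0
  %vl = extractvalue { <vscale x 4 x i32>, i32 } %ff, 1
  ; VL stepping: advance the induction by the number of lanes actually read.
  %vl.zext = zext i32 %vl to i64
  %index.next = add nuw i64 %index, %vl.zext
  ; Early-exit test restricted to the first VL lanes, so no poisoned lane can
  ; reach the branch (this revision uses vp.reduce.or for that).
  %cmp.vec = icmp eq <vscale x 4 x i32> %data, %splat
  %early = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> %cmp.vec, <vscale x 4 x i1> %true.mask, i32 %vl)
  %done = icmp eq i64 %index.next, %n.vec
  %exit = or i1 %early, %done
  br i1 %exit, label %exit.block, label %vector.body

exit.block:
  ret i64 %index
}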
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 83e3fcaaeee2b..bd5b36609276b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -35,6 +35,7 @@ class raw_ostream;
class Value;
class VPDef;
struct VPDoubleValueDef;
+struct VPWidenFFLoadRecipe;
class VPSlotTracker;
class VPUser;
class VPRecipeBase;
@@ -48,6 +49,7 @@ class VPPhiAccessors;
class LLVM_ABI_FOR_TEST VPValue {
friend class VPDef;
friend struct VPDoubleValueDef;
+ friend struct VPWidenFFLoadRecipe;
friend class VPInterleaveBase;
friend class VPlan;
friend class VPExpressionRecipe;
@@ -351,6 +353,7 @@ class VPDef {
VPWidenCastSC,
VPWidenGEPSC,
VPWidenIntrinsicSC,
+ VPWidenFFLoadSC,
VPWidenLoadEVLSC,
VPWidenLoadSC,
VPWidenStoreEVLSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 91734a10cb2c8..489fb5f956b9c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -167,8 +167,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
}
return VerifyEVLUse(*R, 2);
})
- .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe,
- VPInterleaveEVLRecipe>(
+ .Case<VPWidenLoadEVLRecipe, VPWidenFFLoadRecipe,
+ VPVectorEndPointerRecipe, VPInterleaveEVLRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
.Case<VPInstructionWithType>(
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/find.ll b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
new file mode 100644
index 0000000000000..f734bd5f53c82
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -enable-early-exit-with-ffload -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
+
+define ptr @find_with_liveout(ptr %first, ptr %last, ptr %value) {
+; CHECK-LABEL: define ptr @find_with_liveout(
+; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]], ptr [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[FIRST4:%.*]] = ptrtoint ptr [[FIRST]] to i64
+; CHECK-NEXT: [[LAST3:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT: [[FIRST2:%.*]] = ptrtoint ptr [[FIRST]] to i64
+; CHECK-NEXT: [[LAST1:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq ptr [[FIRST]], [[LAST]]
+; CHECK-NEXT: br i1 [[CMP_NOT6]], label %[[RETURN:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK: [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VALUE]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[LAST3]], -4
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[FIRST4]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP6]], i64 20)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[UMAX]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[LAST1]] to i2
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[FIRST2]] to i2
+; CHECK-NEXT: [[TMP9:%.*]] = sub i2 [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = zext i2 [[TMP9]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], [[TMP12]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N_VEC]], [[INDEX]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ule i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP12]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+; CHECK-NEXT: [[VP_OP_LOAD_FF:%.*]] = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 [[NEXT_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq <vscale x 4 x i32> [[TMP19]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
+; CHECK-NEXT: [[TMP24:%.*]] = or i1 [[TMP23]], false
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i1 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP24]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[RETURN_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP21]], i1 true)
+; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP29]]
+; CHECK-NEXT: br label %[[RETURN_LOOPEXIT]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ [[FIRST]], %[[FOR_BODY_LR_PH]] ], [ [[FIRST]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[FIRST_ADDR_07:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[FIRST_ADDR_07]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP31]], [[TMP0]]
+; CHECK-NEXT: br i1 [[CMP1]], label %[[RETURN_LOOPEXIT]], label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[FIRST_ADDR_07]], i64 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[LAST]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[RETURN_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[RETURN_LOOPEXIT]]:
+; CHECK-NEXT: [[RETVAL_0_PH:%.*]] = phi ptr [ [[FIRST_ADDR_07]], %[[FOR_BODY]] ], [ [[LAST]], %[[FOR_INC]] ], [ [[LAST]], %[[MIDDLE_BLOCK]] ], [ [[TMP30]], %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: br label %[[RETURN]]
+; CHECK: [[RETURN]]:
+; CHECK-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[FIRST]], %[[ENTRY]] ], [ [[RETVAL_0_PH]], %[[RETURN_LOOPEXIT]] ]
+; CHECK-NEXT: ret ptr [[RETVAL_0]]
+;
+entry:
+ %cmp.not6 = icmp eq ptr %first, %last
+ br i1 %cmp.not6, label %return, label %for.body.lr.ph
+
+for.body.lr.ph:
+ %0 = load i32, ptr %value, align 4
+ br label %for.body
+
+for.body:
+ %first.addr.07 = phi ptr [ %first, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ]
+ %1 = load i32, ptr %first.addr.07, align 4
+ %cmp1 = icmp eq i32 %1, %0
+ br i1 %cmp1, label %return.loopexit, label %for.inc
+
+for.inc:
+ %incdec.ptr = getelementptr inbounds i32, ptr %first.addr.07, i64 1
+ %cmp.not = icmp eq ptr %incdec.ptr, %last
+ br i1 %cmp.not, label %return.loopexit, label %for.body
+
+return.loopexit:
+ %retval.0.ph = phi ptr [ %first.addr.07, %for.body ], [ %last, %for.inc ]
+ br label %return
+
+return:
+ %retval.0 = phi ptr [ %first, %entry ], [ %retval.0.ph, %return.loopexit ]
+ ret ptr %retval.0
+}
+
+define i32 @find_without_liveout(ptr %first, ptr %last, ptr %value) {
+; CHECK-LABEL: define i32 @find_without_liveout(
+; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]], ptr [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[FIRST4:%.*]] = ptrtoint ptr [[FIRST]] to i64
+; CHECK-NEXT: [[LAST3:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT: [[FIRST2:%.*]] = ptrtoint ptr [[FIRST]] to i64
+; CHECK-NEXT: [[LAST1:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq ptr [[FIRST]], [[LAST]]
+; CHECK-NEXT: br i1 [[CMP_NOT6]], label %[[RETURN:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK: [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VALUE]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[LAST3]], -4
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[FIRST4]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP6]], i64 15)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[UMAX]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[LAST1]] to i2
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[FIRST2]] to i2
+; CHECK-NEXT: [[TMP9:%.*]] = sub i2 [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = zext i2 [[TMP9]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], [[TMP12]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N_VEC]], [[INDEX]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ule i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP12]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+; CHECK-NEXT: [[VP_OP_LOAD_FF:%.*]] = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 [[NEXT_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq <vscale x 4 x i32> [[TMP19]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
+; CHECK-NEXT: [[TMP24:%.*]] = or i1 [[TMP23]], false
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i1 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP24]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[RETURN_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: br label %[[RETURN_LOOPEXIT]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ [[FIRST]], %[[FOR_BODY_LR_PH]] ], [ [[FIRST]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[FIRST_ADDR_07:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[FIRST_ADDR_07]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP27]], [[TMP0]]
+; CHECK-NEXT: br i1 [[CMP1]], label %[[RETURN_LOOPEXIT]], label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[FIRST_ADDR_07]], i64 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[LAST]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[RETURN_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[RETURN_LOOPEXIT]]:
+; CHECK-NEXT: [[RETVAL_0_PH:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ 1, %[[FOR_INC]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: br label %[[RETURN]]
+; CHECK: [[RETURN]]:
+; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RETVAL_0_PH]], %[[RETURN_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+entry:
+ %cmp.not6 = icmp eq ptr %first, %last
+ br i1 %cmp.not6, label %return, label %for.body.lr.ph
+
+for.body.lr.ph:
+ %0 = load i32, ptr %value, align 4
+ br label %for.body
+
+for.body:
+ %first.addr.07 = phi ptr [ %first, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ]
+ %1 = load i32, ptr %first.addr.07, align 4
+ %cmp1 = icmp eq i32 %1, %0
+ br i1 %cmp1, label %return.loopexit, label %for.inc
+
+for.inc:
+ %incdec.ptr = getelementptr inbounds i32, ptr %first.addr.07, i64 1
+ %cmp.not = icmp eq ptr %incdec.ptr, %last
+ br i1 %cmp.not, label %return.loopexit, label %for.body
+
+return.loopexit:
+ %retval.0.ph = phi i32 [ 0, %for.body ], [ 1, %for.inc ]
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 0, %entry ], [ %retval.0.ph, %return.loopexit ]
+ ret i32 %retval.0
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+;.
>From a3bf7f231a05fb5a6e0393f3f8861d2c173665a3 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Thu, 2 Oct 2025 19:41:06 -0700
Subject: [PATCH 2/4] Fix crash in release build
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5f71b0df7cb01..c238a16c6ec15 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2868,11 +2868,12 @@ void VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan) {
VPValue *VPAnyOf = nullptr;
VPValue *VecOp = nullptr;
- assert(
+ [[maybe_unused]] bool IsExitingOnAnyOfOr =
match(LatchExitingBr,
m_BranchOnCond(m_BinaryOr(m_VPValue(VPAnyOf), m_VPValue()))) &&
- match(VPAnyOf, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(VecOp))) &&
- "unexpected exiting sequence in early exit loop");
+ match(VPAnyOf, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(VecOp)));
+ assert(IsExitingOnAnyOfOr &&
+ "unexpected exiting sequence in early exit loop");
VPValue *OpVPEVLI32 = LastFFLoad->getVPValue(1);
VPValue *Mask = LastFFLoad->getMask();
>From 99ea3062fc3dd0788a3adf7f94478abd5787d5e3 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Fri, 17 Oct 2025 02:56:54 -0700
Subject: [PATCH 3/4] Address comments
1. Use (AnyOf (logical-and %alm, %cond)) instead of VPReductionEVLRecipe
2. Replace WidenFFLoadRecipe with WidenIntrinsicRecipe
3. Handle StructType for WidenIntrinsicRecipe
---
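[Note for reviewers, not part of the commit message: a rough sketch of how the reworked exit condition from item 1 above could look after lowering, assuming the usual lowerings of the VPlan opcodes involved (ActiveLaneMask to llvm.get.active.lane.mask, logical-and to a select, any-of to an or-reduction). This is not taken from the updated find.ll output and value names are illustrative:]

declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
declare i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1>)

; %cmp.vec is the widened early-exit compare; %faulting.lane is field 1 of the
; vp.load.ff result, extended to the induction-variable type.
define i1 @masked_anyof_sketch(<vscale x 4 x i1> %cmp.vec, i64 %faulting.lane) {
  ; Mask out lanes at and beyond the first faulting lane before reducing,
  ; so the branch never observes a poisoned lane.
  %alm = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %faulting.lane)
  %masked = select <vscale x 4 x i1> %alm, <vscale x 4 x i1> %cmp.vec, <vscale x 4 x i1> zeroinitializer
  %any = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> %masked)
  ret i1 %any
}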
---
llvm/include/llvm/Analysis/VectorUtils.h | 5 +
llvm/lib/Analysis/VectorUtils.cpp | 18 ++
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 21 ++
llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 11 +
.../Transforms/Vectorize/LoopVectorize.cpp | 47 ++-
.../Transforms/Vectorize/VPRecipeBuilder.h | 5 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 45 ---
.../Transforms/Vectorize/VPlanAnalysis.cpp | 11 +-
llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 3 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 76 ++---
.../Transforms/Vectorize/VPlanTransforms.cpp | 112 ++++---
llvm/lib/Transforms/Vectorize/VPlanValue.h | 3 -
.../Transforms/Vectorize/VPlanVerifier.cpp | 4 +-
.../Transforms/LoopVectorize/RISCV/find.ll | 306 +++++++-----------
.../RISCV/vplan-vp-load-ff-intrinsics.ll | 45 +++
16 files changed, 376 insertions(+), 340 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-load-ff-intrinsics.ll
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index ce94906ee7c00..f0c064f5490a7 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -164,6 +164,11 @@ LLVM_ABI bool
isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx,
const TargetTransformInfo *TTI);
+/// Identifies if the vector form of the intrinsic that returns a struct has
+/// a scalar element at the struct element index \p RetIdx.
+LLVM_ABI bool isVectorIntrinsicWithStructReturnScalarAtField(Intrinsic::ID ID,
+ int RetIdx);
+
/// Identifies if the vector form of the intrinsic that returns a struct is
/// overloaded at the struct element index \p RetIdx. /// \p TTI is used to
/// consider target specific intrinsics, if no target specific intrinsics
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 091d94843698c..c6797dd06cc40 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -175,6 +175,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
+ case Intrinsic::vp_load_ff:
+ return ScalarOpdIdx == 0 || ScalarOpdIdx == 2;
default:
return false;
}
@@ -212,11 +214,23 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::powi:
case Intrinsic::ldexp:
return OpdIdx == -1 || OpdIdx == 1;
+ case Intrinsic::vp_load_ff:
+ return OpdIdx == 0;
default:
return OpdIdx == -1;
}
}
+bool llvm::isVectorIntrinsicWithStructReturnScalarAtField(Intrinsic::ID ID,
+ int RetIdx) {
+ switch (ID) {
+ case Intrinsic::vp_load_ff:
+ return RetIdx == 1;
+ default:
+ return false;
+ }
+}
+
bool llvm::isVectorIntrinsicWithStructReturnOverloadAtField(
Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI) {
@@ -224,6 +238,10 @@ bool llvm::isVectorIntrinsicWithStructReturnOverloadAtField(
return TTI->isTargetIntrinsicWithStructReturnOverloadAtField(ID, RetIdx);
switch (ID) {
+ case Intrinsic::modf:
+ case Intrinsic::sincos:
+ case Intrinsic::sincospi:
+ return false;
case Intrinsic::frexp:
return RetIdx == 0 || RetIdx == 1;
default:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1c930acd9c4a0..45d5e86602346 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24909,6 +24909,27 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
if (!Subtarget.hasVInstructions())
return false;
+ // Only support fixed vectors if we know the minimum vector size.
+ if (DataType.isFixedLengthVector() &&
+ !Subtarget.useRVVForFixedLengthVectors())
+ return false;
+
+ EVT ScalarType = DataType.getScalarType();
+ if (!isLegalElementTypeForRVV(ScalarType))
+ return false;
+
+ if (!Subtarget.enableUnalignedVectorMem() &&
+ Alignment < ScalarType.getStoreSize())
+ return false;
+
+ return true;
+}
+
+bool RISCVTargetLowering::isLegalFirstFaultLoad(EVT DataType,
+ Align Alignment) const {
+ if (!Subtarget.hasVInstructions())
+ return false;
+
// Only support fixed vectors if we know the minimum vector size.
if (DataType.isFixedLengthVector() && !Subtarget.useRVVForFixedLengthVectors())
return false;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 9e3e2a9443625..2e39e3e9d3c51 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -426,6 +426,10 @@ class RISCVTargetLowering : public TargetLowering {
/// alignment is legal.
bool isLegalStridedLoadStore(EVT DataType, Align Alignment) const;
+ /// Return true if a fault-only-first load of the given result type and
+ /// alignment is legal.
+ bool isLegalFirstFaultLoad(EVT DataType, Align Alignment) const;
+
unsigned getMaxSupportedInterleaveFactor() const override { return 8; }
bool fallBackToDAGISel(const Instruction &Inst) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 7bc0b5b394828..f8d78d3da289b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1589,6 +1589,17 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
CmpInst::FCMP_UNO, CostKind);
return Cost;
}
+ case Intrinsic::vp_load_ff: {
+ Type *DataTy = RetTy->getStructElementType(0);
+ EVT DataTypeVT = TLI->getValueType(DL, DataTy);
+ // TODO: Extend IntrinsicCostAttributes to accept Align parameter.
+ Align Alignment;
+ if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
+ return InstructionCost::getInvalid();
+
+ return getMemoryOpCost(Instruction::Load, DataTy, Alignment, 0, CostKind,
+ {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
+ }
}
if (ST->hasVInstructions() && RetTy->isVectorTy()) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f53b7bd354b53..52f44f5763a64 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4076,7 +4076,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPReductionPHISC:
case VPDef::VPInterleaveEVLSC:
case VPDef::VPInterleaveSC:
- case VPDef::VPWidenFFLoadSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenStoreEVLSC:
@@ -4116,7 +4115,23 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
if (!Visited.insert({ScalarTy}).second)
continue;
- Type *WideTy = toVectorizedTy(ScalarTy, VF);
+
+ Type *WideTy;
+ if (auto *WI = dyn_cast<VPWidenIntrinsicRecipe>(&R);
+ WI && ScalarTy->isStructTy()) {
+ auto *StructTy = cast<StructType>(ScalarTy);
+ SmallVector<Type *, 2> Tys;
+ for (unsigned I = 0, E = StructTy->getNumElements(); I != E; ++I) {
+ Type *ElementTy = StructTy->getStructElementType(I);
+ if (!isVectorIntrinsicWithStructReturnScalarAtField(
+ WI->getVectorIntrinsicID(), I))
+ ElementTy = toVectorizedTy(ElementTy, VF);
+ Tys.push_back(ElementTy);
+ }
+ WideTy = StructType::create(Tys);
+ } else
+ WideTy = toVectorizedTy(ScalarTy, VF);
+
if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
return true;
}
@@ -7504,9 +7519,9 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}
-VPWidenMemoryRecipe *
-VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
- VFRange &Range) {
+VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
+ ArrayRef<VPValue *> Operands,
+ VFRange &Range) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Must be called with either a load or store");
@@ -7564,9 +7579,21 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
- if (Legal->getPotentiallyFaultingLoads().contains(I))
- return new VPWidenFFLoadRecipe(*cast<LoadInst>(I), Ptr, &Plan.getVF(), Mask,
- VPIRMetadata(*I, LVer), I->getDebugLoc());
+
+ if (Legal->getPotentiallyFaultingLoads().contains(I)) {
+ auto *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+ auto *RetTy = StructType::create({I->getType(), I32Ty});
+ DebugLoc DL = I->getDebugLoc();
+ if (!Mask)
+ Mask = Plan.getOrAddLiveIn(
+ ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
+ auto *FFLoad = new VPWidenIntrinsicRecipe(
+ Intrinsic::vp_load_ff, {Ptr, Mask, &Plan.getVF()}, RetTy, DL);
+ Builder.insert(FFLoad);
+ VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 0));
+ return new VPWidenRecipe(Instruction::ExtractValue, {FFLoad, Zero}, {}, {},
+ DL);
+ }
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
@@ -8405,10 +8432,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
if (Recipe->getNumDefinedValues() == 1) {
SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
Old2New[SingleDef] = Recipe->getVPSingleValue();
- } else if (isa<VPWidenFFLoadRecipe>(Recipe)) {
- VPValue *Data = Recipe->getVPValue(0);
- SingleDef->replaceAllUsesWith(Data);
- Old2New[SingleDef] = Data;
} else {
assert(Recipe->getNumDefinedValues() == 0 &&
"Unexpected multidef recipe");
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 41878e3c648e3..16eb8cffb47a0 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -96,9 +96,8 @@ class VPRecipeBuilder {
/// Check if the load or store instruction \p I should widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
- VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I,
- ArrayRef<VPValue *> Operands,
- VFRange &Range);
+ VPRecipeBase *tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
+ VFRange &Range);
/// Check if an induction recipe should be constructed for \p Phi. If so build
/// and return it. If not, return null.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 33eec80f97eff..5b9f005e50e47 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -559,7 +559,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
- case VPRecipeBase::VPWidenFFLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -2837,13 +2836,6 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe {
ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
R.isOrdered(), DL) {}
- VPReductionEVLRecipe(RecurKind RdxKind, FastMathFlags FMFs, VPValue *ChainOp,
- VPValue *VecOp, VPValue &EVL, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
- : VPReductionRecipe(VPDef::VPReductionEVLSC, RdxKind, FMFs, nullptr,
- ArrayRef<VPValue *>({ChainOp, VecOp, &EVL}), CondOp,
- IsOrdered, DL) {}
-
~VPReductionEVLRecipe() override = default;
VPReductionEVLRecipe *clone() override {
@@ -3221,7 +3213,6 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenFFLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
}
@@ -3303,42 +3294,6 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
}
};
-/// A recipe for widening loads using fault-only-first intrinsics.
-/// Produces two results: (1) the loaded data, and (2) the index of the first
-/// non-dereferenceable lane, or VF if all lanes are successfully read.
-struct VPWidenFFLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
- VPWidenFFLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *VF, VPValue *Mask,
- const VPIRMetadata &Metadata, DebugLoc DL)
- : VPWidenMemoryRecipe(VPDef::VPWidenFFLoadSC, Load, {Addr, VF},
- /*Consecutive*/ true, /*Reverse*/ false, Metadata,
- DL),
- VPValue(this, &Load) {
- new VPValue(nullptr, this); // Index of the first lane that faults.
- setMask(Mask);
- }
-
- VP_CLASSOF_IMPL(VPDef::VPWidenFFLoadSC);
-
- /// Return the VF operand.
- VPValue *getVF() const { return getOperand(1); }
- void setVF(VPValue *V) { setOperand(1, V); }
-
- void execute(VPTransformState &State) override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-
- /// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
- assert(is_contained(operands(), Op) &&
- "Op must be an operand of the recipe");
- return Op == getVF() || Op == getAddr();
- }
-};
-
/// A recipe for widening load operations with vector-predication intrinsics,
/// using the address to load from, the explicit vector length and an optional
/// mask.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 1cd5e81377dc8..8c7229c07aa3e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -137,6 +137,12 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
return Type::getVoidTy(Ctx);
+ case Instruction::ExtractValue: {
+ assert(R->getNumOperands() == 2 && "expected single level extractvalue");
+ auto *StructTy = cast<StructType>(inferScalarType(R->getOperand(0)));
+ auto *CI = cast<ConstantInt>(R->getOperand(1)->getLiveInIRValue());
+ return StructTy->getTypeAtIndex(CI->getZExtValue());
+ }
default:
break;
}
@@ -190,9 +196,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
- assert(
- (isa<VPWidenLoadRecipe, VPWidenFFLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
- "Store recipes should not define any values");
+ assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
+ "Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 2aaabd9ebdd04..5234d85683cec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -252,7 +252,8 @@ struct VPTransformState {
set(Def, V, VPLane(0));
return;
}
- assert((VF.isScalar() || isVectorizedTy(V->getType())) &&
+ assert((VF.isScalar() || isVectorizedTy(V->getType()) ||
+ V->getType()->isStructTy()) &&
"scalar values must be stored as (0, 0)");
Data.VPV2Vector[Def] = V;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f84456c6ef15d..fd65ef0fd9687 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -74,7 +74,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPReductionPHISC:
case VPScalarIVStepsSC:
case VPPredInstPHISC:
- case VPWidenFFLoadSC:
return false;
case VPBlendSC:
case VPReductionEVLSC:
@@ -109,7 +108,6 @@ bool VPRecipeBase::mayReadFromMemory() const {
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
- case VPWidenFFLoadSC:
return true;
case VPReplicateSC:
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
@@ -626,6 +624,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
return Builder.CreateNot(A, Name);
}
+ case Instruction::ExtractValue: {
+ assert(getNumOperands() == 2 && "expected single level extractvalue");
+ Value *Op = State.get(getOperand(0));
+ auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
+ return Builder.CreateExtractValue(Op, CI->getZExtValue());
+ }
case Instruction::ExtractElement: {
assert(State.VF.isVector() && "Only extract elements from vectors");
if (getOperand(1)->isLiveIn()) {
@@ -1196,6 +1200,7 @@ bool VPInstruction::isVectorToScalar() const {
bool VPInstruction::isSingleScalar() const {
switch (getOpcode()) {
case Instruction::PHI:
+ case Instruction::ExtractValue:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::ResumeForEpilogue:
case VPInstruction::VScale:
@@ -1740,7 +1745,16 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
- if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
+ if (ResultTy->isStructTy()) {
+ auto *StructTy = cast<StructType>(ResultTy);
+ for (unsigned I = 0, E = StructTy->getNumElements(); I != E; ++I) {
+ if (isVectorIntrinsicWithStructReturnOverloadAtField(VectorIntrinsicID, I,
+ State.TTI))
+ TysForDecl.push_back(
+ toVectorizedTy(StructTy->getStructElementType(I), State.VF));
+ }
+ } else if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
+ State.TTI))
TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(operands())) {
@@ -1804,8 +1818,19 @@ static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
Arguments.push_back(V);
}
- Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
- Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
+ Type *RetTy = Ctx.Types.inferScalarType(&R);
+ if (RetTy->isStructTy()) {
+ auto *StructTy = cast<StructType>(RetTy);
+ SmallVector<Type *> Tys;
+ for (unsigned I = 0, E = StructTy->getNumElements(); I != E; ++I) {
+ Type *ElementTy = StructTy->getStructElementType(I);
+ if (!isVectorIntrinsicWithStructReturnScalarAtField(ID, I))
+ ElementTy = toVectorizedTy(ElementTy, VF);
+ Tys.push_back(ElementTy);
+ }
+ RetTy = StructType::create(Tys);
+ } else if (VF.isVector())
+ RetTy = toVectorizedTy(RetTy, VF);
SmallVector<Type *> ParamTys;
for (const VPValue *Op : Operands) {
ParamTys.push_back(VF.isVector()
@@ -3618,47 +3643,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPWidenFFLoadRecipe::execute(VPTransformState &State) {
- Type *ScalarDataTy = getLoadStoreType(&Ingredient);
- auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
-
- auto &Builder = State.Builder;
- State.setDebugLocFrom(getDebugLoc());
-
- Value *VL = State.get(getVF(), VPLane(0));
- Type *I32Ty = Builder.getInt32Ty();
- VL = Builder.CreateZExtOrTrunc(VL, I32Ty);
- Value *Addr = State.get(getAddr(), true);
- Value *Mask = nullptr;
- if (VPValue *VPMask = getMask())
- Mask = State.get(VPMask);
- else
- Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
- CallInst *NewLI =
- Builder.CreateIntrinsic(Intrinsic::vp_load_ff, {DataTy, Addr->getType()},
- {Addr, Mask, VL}, nullptr, "vp.op.load.ff");
- NewLI->addParamAttr(
- 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
- applyMetadata(*NewLI);
- Value *V = cast<Instruction>(Builder.CreateExtractValue(NewLI, 0));
- Value *NewVL = Builder.CreateExtractValue(NewLI, 1);
- State.set(getVPValue(0), V);
- State.set(getVPValue(1), NewVL, /*NeedsScalar=*/true);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenFFLoadRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN ";
- printAsOperand(O, SlotTracker);
- O << ", ";
- getVPValue(1)->printAsOperand(O, SlotTracker);
- O << " = vp.load.ff ";
- printOperands(O, SlotTracker);
-}
-#endif
-
/// Use all-true mask for reverse rather than actual mask, as it avoids a
/// dependence w/o affecting the result.
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c238a16c6ec15..b939f82bf6b1a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2833,14 +2833,16 @@ void VPlanTransforms::addExplicitVectorLength(
}
void VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan) {
+ using namespace SCEVPatternMatch;
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- VPWidenFFLoadRecipe *LastFFLoad = nullptr;
+ VPWidenIntrinsicRecipe *LastFFLoad = nullptr;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion())))
for (VPRecipeBase &R : *VPBB)
- if (auto *Load = dyn_cast<VPWidenFFLoadRecipe>(&R)) {
+ if (match(&R, m_Intrinsic<Intrinsic::vp_load_ff>(m_VPValue(), m_VPValue(),
+ m_VPValue()))) {
assert(!LastFFLoad && "Only one FFLoad is supported");
- LastFFLoad = Load;
+ LastFFLoad = cast<VPWidenIntrinsicRecipe>(&R);
}
// Skip if no FFLoad.
@@ -2850,16 +2852,29 @@ void VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan) {
// Ensure FFLoad does not read past the remainder in the last iteration.
// Set AVL to min(VF, remainder).
VPBuilder Builder(Header, Header->getFirstNonPhi());
+ DebugLoc DL = LastFFLoad->getDebugLoc();
VPValue *Remainder = Builder.createNaryOp(
- Instruction::Sub, {&Plan.getVectorTripCount(), Plan.getCanonicalIV()});
+ Instruction::Sub, {&Plan.getVectorTripCount(), Plan.getCanonicalIV()},
+ DL);
VPValue *Cmp =
- Builder.createICmp(CmpInst::ICMP_ULE, &Plan.getVF(), Remainder);
- VPValue *AVL = Builder.createSelect(Cmp, &Plan.getVF(), Remainder);
- LastFFLoad->setVF(AVL);
-
- // To prevent branch-on-poison, rewrite the early-exit condition to
- // VPReductionEVLRecipe. Expected pattern here is:
- // EMIT vp<%alt.exit.cond> = AnyOf
+ Builder.createICmp(CmpInst::ICMP_ULE, &Plan.getVF(), Remainder, DL);
+ VPValue *AVL = Builder.createSelect(Cmp, &Plan.getVF(), Remainder, DL);
+ Type *CanIVTy = Plan.getCanonicalIV()->getScalarType();
+ Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+ AVL = Builder.createScalarZExtOrTrunc(AVL, I32Ty, CanIVTy, DL);
+ LastFFLoad->setOperand(2, AVL);
+
+ // To prevent branch-on-poison, mask the early-exit condition with
+ // active-lane-mask. Expected pattern here is:
+ // Before:
+ // EMIT vp<%alt.exit.cond> = any-of vp<%cond>
+ // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+ // EMIT branch-on-cond vp<%exit.cond>
+ // After:
+ // EMIT vp<%faulting.lane> = extractvalue vp<%ffload>, 1
+ // EMIT vp<%alm> = active lane mask 0, vp<%faulting.lane>
+ // EMIT vp<%and> = logical-and vp<%alm>, vp<%cond>
+ // EMIT vp<%alt.exit.cond> = any-of vp<%and>
// EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
// EMIT branch-on-cond vp<%exit.cond>
auto *ExitingLatch =
@@ -2875,17 +2890,22 @@ void VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan) {
assert(IsExitingOnAnyOfOr &&
"unexpected exiting sequence in early exit loop");
- VPValue *OpVPEVLI32 = LastFFLoad->getVPValue(1);
- VPValue *Mask = LastFFLoad->getMask();
- FastMathFlags FMF;
- auto *I1Ty = Type::getInt1Ty(Plan.getContext());
- VPValue *VPZero = Plan.getOrAddLiveIn(ConstantInt::get(I1Ty, 0));
- DebugLoc DL = VPAnyOf->getDefiningRecipe()->getDebugLoc();
- auto *NewAnyOf =
- new VPReductionEVLRecipe(RecurKind::Or, FMF, VPZero, VecOp, *OpVPEVLI32,
- Mask, /*IsOrdered*/ false, DL);
- NewAnyOf->insertBefore(VPAnyOf->getDefiningRecipe());
- VPAnyOf->replaceAllUsesWith(NewAnyOf);
+ // Creates the VPValue for the index of the faulting lane.
+ VPRecipeBase *AnyOfR = VPAnyOf->getDefiningRecipe();
+ Builder.setInsertPoint(cast<VPRecipeBase>(*LastFFLoad->user_begin()));
+ VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
+ VPValue *FaultingLane =
+ Builder.createNaryOp(Instruction::ExtractValue, {LastFFLoad, One}, DL);
+ FaultingLane =
+ Builder.createScalarZExtOrTrunc(FaultingLane, CanIVTy, I32Ty, DL);
+ VPValue *ALMMultiplier = Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, 1));
+ Builder.setInsertPoint(AnyOfR);
+ DL = AnyOfR->getDebugLoc();
+ auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, 0));
+ auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+ {Zero, FaultingLane, ALMMultiplier}, DL);
+ auto *R = Builder.createNaryOp(VPInstruction::LogicalAnd, {ALM, VecOp}, DL);
+ AnyOfR->setOperand(0, R);
// Using FirstActiveLane in the early-exit block is safe, as the exiting
// condition guarantees at least one valid lane precedes
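As a reference for the Before/After comment above, here is a minimal IR sketch of the clamped AVL and the poison-safe exit condition this rewrite produces; value names are illustrative and mirror the updated find.ll checks below:

  ; Clamp the AVL to min(VF, remaining trip count) so the load never reads
  ; past the vector trip count.
  %remainder = sub i64 %n.vec, %index
  %vf.fits   = icmp ule i64 %vf, %remainder
  %avl       = select i1 %vf.fits, i64 %vf, i64 %remainder
  %avl.i32   = trunc i64 %avl to i32
  ; Mask the early-exit compare with an active-lane mask over the lanes the
  ; load actually produced, so faulted lanes never feed the branch.
  %alm    = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %faulting.lane)
  %masked = select <vscale x 16 x i1> %alm, <vscale x 16 x i1> %cmp, <vscale x 16 x i1> zeroinitializer
  %frozen = freeze <vscale x 16 x i1> %masked
  %any.of = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %frozen)
  %exit   = or i1 %any.of, %main.exit.cond
  br i1 %exit, label %middle.split, label %vector.body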
@@ -2893,15 +2913,16 @@ void VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan) {
}
void VPlanTransforms::convertFFLoadEarlyExitToVLStepping(VPlan &Plan) {
- // Find loop header by locating VPWidenFFLoadRecipe.
- VPWidenFFLoadRecipe *LastFFLoad = nullptr;
-
+ using namespace SCEVPatternMatch;
+ // Find loop header by locating FFLoad.
+ VPWidenIntrinsicRecipe *LastFFLoad = nullptr;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getEntry())))
for (VPRecipeBase &R : *VPBB)
- if (auto *Load = dyn_cast<VPWidenFFLoadRecipe>(&R)) {
+ if (match(&R, m_Intrinsic<Intrinsic::vp_load_ff>(m_VPValue(), m_VPValue(),
+ m_VPValue()))) {
assert(!LastFFLoad && "Only one FFLoad is supported");
- LastFFLoad = Load;
+ LastFFLoad = cast<VPWidenIntrinsicRecipe>(&R);
}
// Skip if no FFLoad.
@@ -2909,25 +2930,36 @@ void VPlanTransforms::convertFFLoadEarlyExitToVLStepping(VPlan &Plan) {
return;
VPBasicBlock *HeaderVPBB = LastFFLoad->getParent();
- // Replace IVStep (VFxUF) with returned VL from FFLoad.
+ // Replace IVStep (VFxUF) with the faulting-lane index returned by FFLoad.
auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
VPValue *Backedge = CanonicalIV->getIncomingValue(1);
assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
m_Specific(&Plan.getVFxUF()))) &&
"Unexpected canonical iv");
VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
- VPValue *OpVPEVLI32 = LastFFLoad->getVPValue(1);
- VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
- Builder.setInsertPoint(CanonicalIVIncrement);
- auto *TC = Plan.getTripCount();
- Type *CanIVTy = TC->isLiveIn()
- ? TC->getLiveInIRValue()->getType()
- : cast<VPExpandSCEVRecipe>(TC)->getSCEV()->getType();
- auto *I32Ty = Type::getInt32Ty(Plan.getContext());
- VPValue *OpVPEVL = Builder.createScalarZExtOrTrunc(
- OpVPEVLI32, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
-
- CanonicalIVIncrement->setOperand(1, OpVPEVL);
+ // Expected pattern
+ // EMIT vp<%alm> = active lane mask 0, vp<%faulting.lane>
+ // EMIT vp<%and> = logical-and vp<%alm>, vp<%cond>
+ // EMIT vp<%alt.exit.cond> = any-of vp<%and>
+ // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+ // EMIT branch-on-cond vp<%exit.cond>
+ // Use the faulting-lane index to step the IV.
+ VPBasicBlock *LatchExiting =
+ HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
+ auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
+ VPValue *VPAnyOf = nullptr;
+ VPValue *FaultingLane = nullptr;
+ [[maybe_unused]] bool IsExitingOnAnyOfOr =
+ match(LatchExitingBr,
+ m_BranchOnCond(m_BinaryOr(m_VPValue(VPAnyOf), m_VPValue()))) &&
+ match(VPAnyOf,
+ m_VPInstruction<VPInstruction::AnyOf>(
+ m_VPInstruction<VPInstruction::LogicalAnd>(
+ m_VPInstruction<VPInstruction::ActiveLaneMask>(
+ m_ZeroInt(), m_VPValue(FaultingLane), m_VPValue()),
+ m_VPValue())));
+ assert(IsExitingOnAnyOfOr &&
+        "unexpected exiting sequence in early exit loop");
+
+ CanonicalIVIncrement->setOperand(1, FaultingLane);
}
void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
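A minimal sketch of the VL-based stepping installed by convertFFLoadEarlyExitToVLStepping, again with illustrative names mirroring the find.ll checks below: the canonical IV now advances by the lane count returned from the fault-only-first load instead of by VF * UF.

  %vl.zext    = zext i32 %new.vl to i64
  %index.next = add nuw i64 %index, %vl.zext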
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index bd5b36609276b..83e3fcaaeee2b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -35,7 +35,6 @@ class raw_ostream;
class Value;
class VPDef;
struct VPDoubleValueDef;
-struct VPWidenFFLoadRecipe;
class VPSlotTracker;
class VPUser;
class VPRecipeBase;
@@ -49,7 +48,6 @@ class VPPhiAccessors;
class LLVM_ABI_FOR_TEST VPValue {
friend class VPDef;
friend struct VPDoubleValueDef;
- friend struct VPWidenFFLoadRecipe;
friend class VPInterleaveBase;
friend class VPlan;
friend class VPExpressionRecipe;
@@ -353,7 +351,6 @@ class VPDef {
VPWidenCastSC,
VPWidenGEPSC,
VPWidenIntrinsicSC,
- VPWidenFFLoadSC,
VPWidenLoadEVLSC,
VPWidenLoadSC,
VPWidenStoreEVLSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 489fb5f956b9c..91734a10cb2c8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -167,8 +167,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
}
return VerifyEVLUse(*R, 2);
})
- .Case<VPWidenLoadEVLRecipe, VPWidenFFLoadRecipe,
- VPVectorEndPointerRecipe, VPInterleaveEVLRecipe>(
+ .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe,
+ VPInterleaveEVLRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
.Case<VPInstructionWithType>(
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/find.ll b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
index f734bd5f53c82..a82c61db2bcba 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
@@ -1,236 +1,172 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -enable-early-exit-with-ffload -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -enable-early-exit-with-ffload -mtriple=riscv64 -mattr=+v -S %s 2>&1 | FileCheck %s
-define ptr @find_with_liveout(ptr %first, ptr %last, ptr %value) {
-; CHECK-LABEL: define ptr @find_with_liveout(
-; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]], ptr [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+define i64 @find_with_liveout(ptr %first, i8 %value) {
+; CHECK-LABEL: define i64 @find_with_liveout(
+; CHECK-SAME: ptr [[FIRST:%.*]], i8 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[FIRST4:%.*]] = ptrtoint ptr [[FIRST]] to i64
-; CHECK-NEXT: [[LAST3:%.*]] = ptrtoint ptr [[LAST]] to i64
-; CHECK-NEXT: [[FIRST2:%.*]] = ptrtoint ptr [[FIRST]] to i64
-; CHECK-NEXT: [[LAST1:%.*]] = ptrtoint ptr [[LAST]] to i64
-; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq ptr [[FIRST]], [[LAST]]
-; CHECK-NEXT: br i1 [[CMP_NOT6]], label %[[RETURN:.*]], label %[[FOR_BODY_LR_PH:.*]]
-; CHECK: [[FOR_BODY_LR_PH]]:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VALUE]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[LAST3]], -4
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[FIRST4]]
-; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP6]], i64 20)
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[UMAX]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK: [[VECTOR_SCEVCHECK]]:
-; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[LAST1]] to i2
-; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[FIRST2]] to i2
-; CHECK-NEXT: [[TMP9:%.*]] = sub i2 [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = zext i2 [[TMP9]] to i64
-; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP10]], 0
-; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], [[TMP12]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[VALUE]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N_VEC]], [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ule i64 [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP12]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
-; CHECK-NEXT: [[VP_OP_LOAD_FF:%.*]] = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 [[NEXT_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
-; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 1
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq <vscale x 4 x i32> [[TMP19]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
-; CHECK-NEXT: [[TMP24:%.*]] = or i1 [[TMP23]], false
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: [[TMP26:%.*]] = or i1 [[TMP24]], [[TMP25]]
-; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[N_VEC]], [[IV]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ule i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; CHECK-NEXT: [[FIRST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[FIRST]], i64 [[IV]]
+; CHECK-NEXT: [[TMP9:%.*]] = call { <vscale x 16 x i8>, i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr [[FIRST_ADDR]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, i32 } [[TMP9]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, i32 } [[TMP9]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 16 x i8> [[TMP12]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP11]])
+; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = freeze <vscale x 16 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_SPLIT]]:
-; CHECK-NEXT: br i1 [[TMP24]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[RETURN_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[VECTOR_EARLY_EXIT]]:
-; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP21]], i1 true)
-; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP29]]
-; CHECK-NEXT: br label %[[RETURN_LOOPEXIT]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP13]], i1 true)
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[IV]], [[TMP20]]
+; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ [[FIRST]], %[[FOR_BODY_LR_PH]] ], [ [[FIRST]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[FIRST_ADDR_07:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_INC:.*]] ]
-; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[FIRST_ADDR_07]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP31]], [[TMP0]]
-; CHECK-NEXT: br i1 [[CMP1]], label %[[RETURN_LOOPEXIT]], label %[[FOR_INC]]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[FIRST]], i64 [[IV1]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ADDR]], align 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP29]], [[VALUE]]
+; CHECK-NEXT: br i1 [[CMP1]], label %[[EXIT]], label %[[FOR_INC]]
; CHECK: [[FOR_INC]]:
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[FIRST_ADDR_07]], i64 1
-; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[LAST]]
-; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[RETURN_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: [[RETURN_LOOPEXIT]]:
-; CHECK-NEXT: [[RETVAL_0_PH:%.*]] = phi ptr [ [[FIRST_ADDR_07]], %[[FOR_BODY]] ], [ [[LAST]], %[[FOR_INC]] ], [ [[LAST]], %[[MIDDLE_BLOCK]] ], [ [[TMP30]], %[[VECTOR_EARLY_EXIT]] ]
-; CHECK-NEXT: br label %[[RETURN]]
-; CHECK: [[RETURN]]:
-; CHECK-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[FIRST]], %[[ENTRY]] ], [ [[RETVAL_0_PH]], %[[RETURN_LOOPEXIT]] ]
-; CHECK-NEXT: ret ptr [[RETVAL_0]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[IV1]], %[[FOR_BODY]] ], [ 1024, %[[FOR_INC]] ], [ 1024, %[[MIDDLE_BLOCK]] ], [ [[TMP21]], %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
- %cmp.not6 = icmp eq ptr %first, %last
- br i1 %cmp.not6, label %return, label %for.body.lr.ph
-
-for.body.lr.ph:
- %0 = load i32, ptr %value, align 4
br label %for.body
for.body:
- %first.addr.07 = phi ptr [ %first, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ]
- %1 = load i32, ptr %first.addr.07, align 4
- %cmp1 = icmp eq i32 %1, %0
- br i1 %cmp1, label %return.loopexit, label %for.inc
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+ %addr = getelementptr inbounds i8, ptr %first, i64 %iv
+ %1 = load i8, ptr %addr, align 1
+ %cmp1 = icmp eq i8 %1, %value
+ br i1 %cmp1, label %exit, label %for.inc
for.inc:
- %incdec.ptr = getelementptr inbounds i32, ptr %first.addr.07, i64 1
- %cmp.not = icmp eq ptr %incdec.ptr, %last
- br i1 %cmp.not, label %return.loopexit, label %for.body
-
-return.loopexit:
- %retval.0.ph = phi ptr [ %first.addr.07, %for.body ], [ %last, %for.inc ]
- br label %return
+ %iv.next = add i64 %iv, 1
+ %cmp.not = icmp eq i64 %iv.next, 1024
+ br i1 %cmp.not, label %exit, label %for.body
-return:
- %retval.0 = phi ptr [ %first, %entry ], [ %retval.0.ph, %return.loopexit ]
- ret ptr %retval.0
+exit:
+ %retval = phi i64 [ %iv, %for.body ], [ 1024, %for.inc ]
+ ret i64 %retval
}
-define i32 @find_without_liveout(ptr %first, ptr %last, ptr %value) {
+define i32 @find_without_liveout(ptr %first, i8 %value) {
; CHECK-LABEL: define i32 @find_without_liveout(
-; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]], ptr [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[FIRST:%.*]], i8 [[VALUE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[FIRST4:%.*]] = ptrtoint ptr [[FIRST]] to i64
-; CHECK-NEXT: [[LAST3:%.*]] = ptrtoint ptr [[LAST]] to i64
-; CHECK-NEXT: [[FIRST2:%.*]] = ptrtoint ptr [[FIRST]] to i64
-; CHECK-NEXT: [[LAST1:%.*]] = ptrtoint ptr [[LAST]] to i64
-; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq ptr [[FIRST]], [[LAST]]
-; CHECK-NEXT: br i1 [[CMP_NOT6]], label %[[RETURN:.*]], label %[[FOR_BODY_LR_PH:.*]]
-; CHECK: [[FOR_BODY_LR_PH]]:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VALUE]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[LAST3]], -4
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[FIRST4]]
-; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP6]], i64 15)
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[UMAX]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK: [[VECTOR_SCEVCHECK]]:
-; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[LAST1]] to i2
-; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[FIRST2]] to i2
-; CHECK-NEXT: [[TMP9:%.*]] = sub i2 [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = zext i2 [[TMP9]] to i64
-; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP10]], 0
-; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], [[TMP12]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[VALUE]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N_VEC]], [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ule i64 [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP12]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
-; CHECK-NEXT: [[VP_OP_LOAD_FF:%.*]] = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 [[NEXT_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
-; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 1
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq <vscale x 4 x i32> [[TMP19]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
-; CHECK-NEXT: [[TMP24:%.*]] = or i1 [[TMP23]], false
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: [[TMP26:%.*]] = or i1 [[TMP24]], [[TMP25]]
-; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[N_VEC]], [[IV]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ule i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; CHECK-NEXT: [[FIRST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[FIRST]], i64 [[IV]]
+; CHECK-NEXT: [[TMP9:%.*]] = call { <vscale x 16 x i8>, i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr [[FIRST_ADDR]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, i32 } [[TMP9]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, i32 } [[TMP9]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 16 x i8> [[TMP12]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP11]])
+; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = freeze <vscale x 16 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_SPLIT]]:
-; CHECK-NEXT: br i1 [[TMP24]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[RETURN_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[VECTOR_EARLY_EXIT]]:
-; CHECK-NEXT: br label %[[RETURN_LOOPEXIT]]
+; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ [[FIRST]], %[[FOR_BODY_LR_PH]] ], [ [[FIRST]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[FIRST_ADDR_07:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_INC:.*]] ]
-; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[FIRST_ADDR_07]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP27]], [[TMP0]]
-; CHECK-NEXT: br i1 [[CMP1]], label %[[RETURN_LOOPEXIT]], label %[[FOR_INC]]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[FIRST]], i64 [[IV1]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[ADDR]], align 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP26]], [[VALUE]]
+; CHECK-NEXT: br i1 [[CMP1]], label %[[EXIT]], label %[[FOR_INC]]
; CHECK: [[FOR_INC]]:
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[FIRST_ADDR_07]], i64 1
-; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[LAST]]
-; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[RETURN_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: [[RETURN_LOOPEXIT]]:
-; CHECK-NEXT: [[RETVAL_0_PH:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ 1, %[[FOR_INC]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
-; CHECK-NEXT: br label %[[RETURN]]
-; CHECK: [[RETURN]]:
-; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RETVAL_0_PH]], %[[RETURN_LOOPEXIT]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ 1, %[[FOR_INC]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i32 [[RETVAL_0]]
;
entry:
- %cmp.not6 = icmp eq ptr %first, %last
- br i1 %cmp.not6, label %return, label %for.body.lr.ph
-
-for.body.lr.ph:
- %0 = load i32, ptr %value, align 4
br label %for.body
for.body:
- %first.addr.07 = phi ptr [ %first, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ]
- %1 = load i32, ptr %first.addr.07, align 4
- %cmp1 = icmp eq i32 %1, %0
- br i1 %cmp1, label %return.loopexit, label %for.inc
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+ %addr = getelementptr inbounds i8, ptr %first, i64 %iv
+ %1 = load i8, ptr %addr, align 1
+ %cmp1 = icmp eq i8 %1, %value
+ br i1 %cmp1, label %exit, label %for.inc
for.inc:
- %incdec.ptr = getelementptr inbounds i32, ptr %first.addr.07, i64 1
- %cmp.not = icmp eq ptr %incdec.ptr, %last
- br i1 %cmp.not, label %return.loopexit, label %for.body
-
-return.loopexit:
- %retval.0.ph = phi i32 [ 0, %for.body ], [ 1, %for.inc ]
- br label %return
+ %iv.next = add i64 %iv, 1
+ %cmp.not = icmp eq i64 %iv.next, 1024
+ br i1 %cmp.not, label %exit, label %for.body
-return:
- %retval.0 = phi i32 [ 0, %entry ], [ %retval.0.ph, %return.loopexit ]
- ret i32 %retval.0
+exit:
+ %retval = phi i32 [ 0, %for.body ], [ 1, %for.inc ]
+ ret i32 %retval
}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
;.
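A note on the early-exit live-out in the checks above: the matching lane is recovered with a first-active-lane count over the compare vector, which the transform's comment notes is safe because the exiting condition guarantees at least one valid lane precedes the faulting one. An illustrative sketch of the VECTOR_EARLY_EXIT block:

  %first.lane = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %cmp, i1 true)
  %match.iv   = add i64 %index, %first.lane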
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-load-ff-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-load-ff-intrinsics.ll
new file mode 100644
index 0000000000000..dec8e35202487
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-load-ff-intrinsics.ll
@@ -0,0 +1,45 @@
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -enable-early-exit-with-ffload \
+; RUN: -mtriple=riscv64 -mattr=+v -disable-output < %s 2>&1 | FileCheck %s
+
+define i64 @find_with_liveout(ptr %first, i8 %value) {
+; CHECK: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4,vscale x 8,vscale x 16},UF={1}' {
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
+; CHECK: vector.body:
+; CHECK-NEXT: EMIT-SCALAR vp<[[IV:%.+]]> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: EMIT vp<[[REMAINDER0:%.+]]> = sub vp<%n.vec>, vp<[[IV]]>
+; CHECK-NEXT: EMIT vp<[[COND:%.+]]> = icmp ule vp<[[VF:%.+]]>, vp<[[REMAINDER0]]>
+; CHECK-NEXT: EMIT vp<[[REMAINDER:%.+]]> = select vp<[[COND]]>, vp<[[VF]]>, vp<[[REMAINDER0]]>
+; CHECK-NEXT: EMIT-SCALAR vp<[[REMAINDER32:%.+]]> = trunc vp<[[REMAINDER]]> to i32
+; CHECK-NEXT: CLONE ir<%addr> = getelementptr inbounds ir<%first>, vp<[[IV]]>
+; CHECK-NEXT: WIDEN-INTRINSIC vp<[[STRUCT:%.+]]> = call llvm.vp.load.ff(ir<%addr>, ir<true>, vp<[[REMAINDER32]]>)
+; CHECK-NEXT: EMIT-SCALAR vp<[[FAULTINGLANE:%.+]]> = extractvalue vp<[[STRUCT]]>, ir<1>
+; CHECK-NEXT: EMIT-SCALAR vp<[[FAULTINGLANE64:%.+]]> = zext vp<[[FAULTINGLANE]]> to i64
+; CHECK-NEXT: WIDEN vp<[[DATA:%.+]]> = extractvalue vp<[[STRUCT]]>, ir<0>
+; CHECK-NEXT: WIDEN ir<%cmp1> = icmp eq vp<[[DATA]]>, vp<[[VALUE:%.+]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[IV]]>, vp<[[FAULTINGLANE64]]>
+; CHECK-NEXT: EMIT vp<[[ALM:%.+]]> = active lane mask ir<0>, vp<[[FAULTINGLANE64]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[ALM1:%.+]]> = logical-and vp<[[ALM]]>, ir<%cmp1>
+; CHECK-NEXT: EMIT vp<[[EARLYEXIT:%.+]]> = any-of vp<[[ALM1]]>
+; CHECK-NEXT: EMIT vp<[[MAINEXIT:%.+]]> = icmp eq vp<%index.next>, vp<%n.vec>
+; CHECK-NEXT: EMIT vp<[[EXIT:%.+]]> = or vp<[[EARLYEXIT]]>, vp<[[MAINEXIT]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EXIT]]>
+; CHECK-NEXT: Successor(s): middle.split, vector.body
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+ %addr = getelementptr inbounds i8, ptr %first, i64 %iv
+ %1 = load i8, ptr %addr, align 1
+ %cmp1 = icmp eq i8 %1, %value
+ br i1 %cmp1, label %exit, label %for.inc
+
+for.inc:
+ %iv.next = add i64 %iv, 1
+ %cmp.not = icmp eq i64 %iv.next, 1024
+ br i1 %cmp.not, label %exit, label %for.body
+
+exit:
+ %retval = phi i64 [ %iv, %for.body ], [ 1024, %for.inc ]
+ ret i64 %retval
+}
>From deb51fd5ee4447d245b3cb9b03a719bde40bb21f Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Sun, 26 Oct 2025 23:38:48 -0700
Subject: [PATCH 4/4] Fix error after merge
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b939f82bf6b1a..1f6947b0b7a5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2853,13 +2853,14 @@ void VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan) {
// Set AVL to min(VF, remainder).
VPBuilder Builder(Header, Header->getFirstNonPhi());
DebugLoc DL = LastFFLoad->getDebugLoc();
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
VPValue *Remainder = Builder.createNaryOp(
- Instruction::Sub, {&Plan.getVectorTripCount(), Plan.getCanonicalIV()},
- DL);
+ Instruction::Sub, {&Plan.getVectorTripCount(), CanonicalIVPHI}, DL);
VPValue *Cmp =
Builder.createICmp(CmpInst::ICMP_ULE, &Plan.getVF(), Remainder, DL);
VPValue *AVL = Builder.createSelect(Cmp, &Plan.getVF(), Remainder, DL);
- Type *CanIVTy = Plan.getCanonicalIV()->getScalarType();
+ Type *CanIVTy = CanonicalIVPHI->getScalarType();
Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
AVL = Builder.createScalarZExtOrTrunc(AVL, I32Ty, CanIVTy, DL);
LastFFLoad->setOperand(2, AVL);