[llvm] [VPlan] Enable vectorization of early-exit loops with unit-stride fault-only-first loads (PR #151300)
Shih-Po Hung via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 10 18:55:36 PST 2025
https://github.com/arcbbb updated https://github.com/llvm/llvm-project/pull/151300
From 34692f5f055b0bfb65b539fb5bf64b6da6ffdf72 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Wed, 3 Dec 2025 00:34:16 -0800
Subject: [PATCH 1/2] [VPlan] Support struct return types for widen intrinsics
(NFC)
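For reference, a rough C++ sketch of the intended behavior, mirroring the new
VectorTypeUtilsTest case (the free helper function here is illustrative, not
part of the patch):

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/VectorTypeUtils.h"
    using namespace llvm;

    // Widen the return type of llvm.vp.load.ff: fields the intrinsic returns
    // as scalars are kept scalar, all other fields are widened to vectors.
    static Type *widenFFLoadRetTy(LLVMContext &C) {
      Type *I8Ty = Type::getInt8Ty(C);
      Type *I32Ty = Type::getInt32Ty(C);
      // { i8 data, i32 VL } is the scalar form of the vp.load.ff result.
      StructType *FFRetTy = StructType::get(I8Ty, I32Ty);
      ElementCount VF = ElementCount::getScalable(16);
      // Without an intrinsic ID both fields would be widened; passing
      // Intrinsic::vp_load_ff keeps the i32 VL field scalar, giving
      // { <vscale x 16 x i8>, i32 }.
      return toVectorizedTy(FFRetTy, VF, Intrinsic::vp_load_ff);
    }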
---
llvm/include/llvm/IR/VectorTypeUtils.h | 15 ++++++++++++---
llvm/lib/Analysis/VectorUtils.cpp | 4 ++++
llvm/lib/IR/VectorTypeUtils.cpp | 16 ++++++++++++++--
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++++-
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 14 ++++++++++++--
llvm/unittests/IR/VectorTypeUtilsTest.cpp | 7 +++++++
6 files changed, 54 insertions(+), 8 deletions(-)
diff --git a/llvm/include/llvm/IR/VectorTypeUtils.h b/llvm/include/llvm/IR/VectorTypeUtils.h
index e3d7fadad6089..3db5c4a2b6576 100644
--- a/llvm/include/llvm/IR/VectorTypeUtils.h
+++ b/llvm/include/llvm/IR/VectorTypeUtils.h
@@ -14,6 +14,11 @@
namespace llvm {
+/// Returns true if \p IID is a vector intrinsic that returns a struct with a
+/// scalar element at index \p EleIdx.
+LLVM_ABI bool isVectorIntrinsicWithStructReturnScalarAtField(unsigned IID,
+ unsigned EleIdx);
+
/// A helper function for converting Scalar types to vector types. If
/// the incoming type is void, we return void. If the EC represents a
/// scalar, we return the scalar type.
@@ -31,7 +36,11 @@ inline Type *toVectorTy(Type *Scalar, unsigned VF) {
/// Note:
/// - If \p EC is scalar, \p StructTy is returned unchanged
/// - Only unpacked literal struct types are supported
-LLVM_ABI Type *toVectorizedStructTy(StructType *StructTy, ElementCount EC);
+/// - By default, every struct field is widened to a vector type.
+/// - If \p IID (Intrinsic ID) is provided, fields that the intrinsic
+/// returns as scalars are kept scalar.
+LLVM_ABI Type *toVectorizedStructTy(StructType *StructTy, ElementCount EC,
+ unsigned IID = 0);
/// A helper for converting structs of vector types to structs of scalar types.
/// Note: Only unpacked literal struct types are supported.
@@ -52,9 +61,9 @@ LLVM_ABI bool canVectorizeStructTy(StructType *StructTy);
/// - If the incoming type is void, we return void
/// - If \p EC is scalar, \p Ty is returned unchanged
/// - Only unpacked literal struct types are supported
-inline Type *toVectorizedTy(Type *Ty, ElementCount EC) {
+inline Type *toVectorizedTy(Type *Ty, ElementCount EC, unsigned IID = 0) {
if (StructType *StructTy = dyn_cast<StructType>(Ty))
- return toVectorizedStructTy(StructTy, EC);
+ return toVectorizedStructTy(StructTy, EC, IID);
return toVectorTy(Ty, EC);
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index a3e9b039f9225..85d1516f40c4c 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -224,6 +224,10 @@ bool llvm::isVectorIntrinsicWithStructReturnOverloadAtField(
return TTI->isTargetIntrinsicWithStructReturnOverloadAtField(ID, RetIdx);
switch (ID) {
+ case Intrinsic::modf:
+ case Intrinsic::sincos:
+ case Intrinsic::sincospi:
+ return false;
case Intrinsic::frexp:
return RetIdx == 0 || RetIdx == 1;
default:
diff --git a/llvm/lib/IR/VectorTypeUtils.cpp b/llvm/lib/IR/VectorTypeUtils.cpp
index 62e39aab90079..5b0b62dac1473 100644
--- a/llvm/lib/IR/VectorTypeUtils.cpp
+++ b/llvm/lib/IR/VectorTypeUtils.cpp
@@ -8,12 +8,21 @@
#include "llvm/IR/VectorTypeUtils.h"
#include "llvm/ADT/SmallVectorExtras.h"
+#include "llvm/IR/Intrinsics.h"
using namespace llvm;
+bool llvm::isVectorIntrinsicWithStructReturnScalarAtField(unsigned IID,
+ unsigned EleIdx) {
+ if (IID == Intrinsic::vp_load_ff)
+ return EleIdx == 1;
+ return false;
+}
+
/// A helper for converting structs of scalar types to structs of vector types.
/// Note: Only unpacked literal struct types are supported.
-Type *llvm::toVectorizedStructTy(StructType *StructTy, ElementCount EC) {
+Type *llvm::toVectorizedStructTy(StructType *StructTy, ElementCount EC,
+ unsigned IID) {
if (EC.isScalar())
return StructTy;
assert(isUnpackedStructLiteral(StructTy) &&
@@ -22,7 +31,10 @@ Type *llvm::toVectorizedStructTy(StructType *StructTy, ElementCount EC) {
"expected all element types to be valid vector element types");
return StructType::get(
StructTy->getContext(),
- map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
+ map_to_vector(enumerate(StructTy->elements()), [&](auto It) -> Type * {
+ Type *ElTy = It.value();
+ if (isVectorIntrinsicWithStructReturnScalarAtField(IID, It.index()))
+ return ElTy;
return VectorType::get(ElTy, EC);
}));
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 15d0fa41bd902..59d7a0864276c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4163,7 +4163,11 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
if (!Visited.insert({ScalarTy}).second)
continue;
- Type *WideTy = toVectorizedTy(ScalarTy, VF);
+ unsigned IID = 0;
+ if (auto *WI = dyn_cast<VPWidenIntrinsicRecipe>(&R))
+ IID = WI->getVectorIntrinsicID();
+ Type *WideTy = toVectorizedTy(ScalarTy, VF, IID);
+
if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b0c8564ad231a..41c3795b54e6e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1695,7 +1695,16 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
- if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
+ if (ResultTy->isStructTy()) {
+ auto *StructTy = cast<StructType>(ResultTy);
+ for (unsigned I = 0, E = StructTy->getNumElements(); I != E; ++I) {
+ if (isVectorIntrinsicWithStructReturnOverloadAtField(VectorIntrinsicID, I,
+ State.TTI))
+ TysForDecl.push_back(
+ toVectorizedTy(StructTy->getStructElementType(I), State.VF));
+ }
+ } else if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
+ State.TTI))
TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(operands())) {
@@ -1760,7 +1769,8 @@ static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
}
Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
- Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
+ Type *RetTy =
+ VF.isVector() ? toVectorizedTy(ScalarRetTy, VF, ID) : ScalarRetTy;
SmallVector<Type *> ParamTys;
for (const VPValue *Op : Operands) {
ParamTys.push_back(VF.isVector()
diff --git a/llvm/unittests/IR/VectorTypeUtilsTest.cpp b/llvm/unittests/IR/VectorTypeUtilsTest.cpp
index c77f183e921de..5d4d30afa8fb0 100644
--- a/llvm/unittests/IR/VectorTypeUtilsTest.cpp
+++ b/llvm/unittests/IR/VectorTypeUtilsTest.cpp
@@ -8,6 +8,7 @@
#include "llvm/IR/VectorTypeUtils.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "gtest/gtest.h"
@@ -24,6 +25,7 @@ TEST(VectorTypeUtilsTest, TestToVectorizedTy) {
Type *FTy = Type::getFloatTy(C);
Type *HomogeneousStructTy = StructType::get(FTy, FTy, FTy);
Type *MixedStructTy = StructType::get(FTy, ITy);
+ Type *FFLoadRetTy = StructType::get(ITy, ITy);
Type *VoidTy = Type::getVoidTy(C);
for (ElementCount VF :
@@ -54,6 +56,11 @@ TEST(VectorTypeUtilsTest, TestToVectorizedTy) {
VectorType::get(ITy, VF));
EXPECT_EQ(toVectorizedTy(VoidTy, VF), VoidTy);
+ Type *WidenFFLoadRetTy =
+ toVectorizedTy(FFLoadRetTy, VF, Intrinsic::vp_load_ff);
+ EXPECT_EQ(cast<StructType>(WidenFFLoadRetTy)->getElementType(0),
+ VectorType::get(ITy, VF));
+ EXPECT_EQ(cast<StructType>(WidenFFLoadRetTy)->getElementType(1), ITy);
}
ElementCount ScalarVF = ElementCount::getFixed(1);
From 6a3dd4d701b10f17d1bde943a3c3130dd5026464 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Wed, 3 Dec 2025 00:31:40 -0800
Subject: [PATCH 2/2] Support WidenFFLoad in early-exit loop
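For context, the kind of loop this enables is roughly the C++ equivalent of
the find_with_liveout test added below (the function name and trip count are
illustrative):

    #include <cstddef>
    #include <cstdint>

    // std::find-style search: the early exit is uncountable, so a plain wide
    // load could read (and fault) past the match. llvm.vp.load.ff stops at the
    // first faulting lane and returns how many lanes were actually loaded; the
    // vectorized loop uses that count both for the exit condition and to step
    // the induction variable.
    size_t find_byte(const uint8_t *first, uint8_t value) {
      for (size_t i = 0; i < 1024; ++i)
        if (first[i] == value)
          return i;
      return 1024;
    }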
---
llvm/lib/Analysis/VectorUtils.cpp | 4 +
.../Transforms/Vectorize/LoopVectorize.cpp | 55 +++++-
.../Transforms/Vectorize/VPRecipeBuilder.h | 2 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 4 +
.../Transforms/Vectorize/VPlanAnalysis.cpp | 7 +
llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 3 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 24 +++
.../Transforms/Vectorize/VPlanTransforms.cpp | 131 +++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 11 ++
.../Transforms/LoopVectorize/RISCV/find.ll | 172 ++++++++++++++++++
.../RISCV/vplan-vp-load-ff-intrinsics.ll | 45 +++++
11 files changed, 453 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/find.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-load-ff-intrinsics.ll
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 85d1516f40c4c..7c09ee108b00e 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -175,6 +175,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
+ case Intrinsic::vp_load_ff:
+ return ScalarOpdIdx == 0 || ScalarOpdIdx == 2;
default:
return false;
}
@@ -212,6 +214,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::powi:
case Intrinsic::ldexp:
return OpdIdx == -1 || OpdIdx == 1;
+ case Intrinsic::vp_load_ff:
+ return OpdIdx == 0;
default:
return OpdIdx == -1;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 59d7a0864276c..c965c7bc89073 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -399,6 +399,12 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));
+static cl::opt<bool>
+ EnableEarlyExitWithFFLoads("enable-early-exit-with-ffload", cl::init(false),
+ cl::Hidden,
+ cl::desc("Enable vectorization of early-exit "
+ "loops with fault-only-first loads."));
+
static cl::opt<bool> ConsiderRegPressure(
"vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
cl::desc("Discard VFs if their register pressure is too high."));
@@ -3551,6 +3557,15 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}
+ if (!Legal->getPotentiallyFaultingLoads().empty() && UserIC > 1) {
+ reportVectorizationFailure("Auto-vectorization of loops with potentially "
+ "faulting loads is not supported when the "
+ "interleave count is more than 1",
+ "CantInterleaveLoopWithPotentiallyFaultingLoads",
+ ORE, TheLoop);
+ return FixedScalableVFPair::getNone();
+ }
+
ScalarEvolution *SE = PSE.getSE();
ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
@@ -4630,6 +4645,10 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
if (!Legal->isSafeForAnyVectorWidth())
return 1;
+ // No interleaving for potentially faulting loads.
+ if (!Legal->getPotentiallyFaultingLoads().empty())
+ return 1;
+
// We don't attempt to perform interleaving for loops with uncountable early
// exits because the VPInstruction::AnyOf code cannot currently handle
// multiple parts.
@@ -7386,6 +7405,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Regions are dissolved after optimizing for VF and UF, which completely
// removes unneeded loop regions first.
VPlanTransforms::dissolveLoopRegions(BestVPlan);
+
+ VPlanTransforms::convertFFLoadEarlyExitToVLStepping(BestVPlan);
+
// Canonicalize EVL loops after regions are dissolved.
VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
@@ -7614,8 +7636,8 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}
-VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
- VFRange &Range) {
+VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
+ VFRange &Range) {
assert((VPI->getOpcode() == Instruction::Load ||
VPI->getOpcode() == Instruction::Store) &&
"Must be called with either a load or store");
@@ -7676,6 +7698,23 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
+
+ if (Legal->getPotentiallyFaultingLoads().contains(I)) {
+ auto *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+ auto *RetTy = StructType::get(I->getType(), I32Ty);
+ DebugLoc DL = I->getDebugLoc();
+ if (!Mask)
+ Mask = Plan.getOrAddLiveIn(
+ ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
+ auto *FFLoad = new VPWidenIntrinsicRecipe(Intrinsic::vp_load_ff,
+ {Ptr, Mask, &Plan.getVF()}, RetTy,
+ *VPI, *VPI, DL);
+ Builder.insert(FFLoad);
+ VPValue *Zero = Plan.getConstantInt(32, 0);
+ return new VPInstruction(VPInstruction::ExtractVectorValue, {FFLoad, Zero},
+ {}, {}, DL);
+ }
+
if (VPI->getOpcode() == Instruction::Load) {
auto *Load = cast<LoadInst>(I);
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, *VPI,
@@ -8621,6 +8660,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
if (!VPlanTransforms::runPass(VPlanTransforms::handleMultiUseReductions,
*Plan))
return nullptr;
+
+ VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(*Plan);
+
// Apply mandatory transformation to handle FP maxnum/minnum reduction with
// NaNs if possible, bail out otherwise.
if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
@@ -9952,7 +9994,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (!LVL.getPotentiallyFaultingLoads().empty()) {
+ if (EnableEarlyExitWithFFLoads) {
+ if (LVL.getPotentiallyFaultingLoads().size() > 1) {
+ reportVectorizationFailure("Auto-vectorization of loops with more than 1 "
+ "potentially faulting load is not enabled",
+ "MoreThanOnePotentiallyFaultingLoad", ORE, L);
+ return false;
+ }
+ } else if (!LVL.getPotentiallyFaultingLoads().empty()) {
reportVectorizationFailure("Auto-vectorization of loops with potentially "
"faulting load is not supported",
"PotentiallyFaultingLoadsNotSupported", ORE, L);
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 1808be118cd2a..59733143c31a2 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -92,7 +92,7 @@ class VPRecipeBuilder {
/// Check if the load or store instruction \p VPI should widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
- VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
+ VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
/// Check if an induction recipe should be constructed for \p VPI. If so build
/// and return it. If not, return null.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 2ef7b5405668d..556c4def62a61 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1125,6 +1125,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
/// The lane specifies an index into a vector formed by combining all vector
/// operands (all operands after the first one).
ExtractLane,
+ /// Extracts a scalar value from an aggregate value.
+ ExtractScalarValue,
+ /// Extracts a vector value from an aggregate value.
+ ExtractVectorValue,
/// Explicit user for the resume phi of the canonical induction in the main
/// VPlan, used by the epilogue vector loop.
ResumeForEpilogue,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index c64b97579881a..00b45b6e99e5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -141,6 +141,13 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
return Type::getVoidTy(Ctx);
+ case VPInstruction::ExtractScalarValue:
+ case VPInstruction::ExtractVectorValue: {
+ assert(R->getNumOperands() == 2 && "expected single level extractvalue");
+ auto *StructTy = cast<StructType>(inferScalarType(R->getOperand(0)));
+ auto *CI = cast<ConstantInt>(R->getOperand(1)->getLiveInIRValue());
+ return StructTy->getTypeAtIndex(CI->getZExtValue());
+ }
default:
break;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index c84e62059c64b..dda349e0480c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -237,7 +237,8 @@ struct VPTransformState {
set(Def, V, VPLane(0));
return;
}
- assert((VF.isScalar() || isVectorizedTy(V->getType())) &&
+ assert((VF.isScalar() || isVectorizedTy(V->getType()) ||
+ V->getType()->isStructTy()) &&
"scalar values must be stored as (0, 0)");
Data.VPV2Vector[Def] = V;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 41c3795b54e6e..37b83bd0c293c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -453,6 +453,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::BranchOnCount:
case VPInstruction::ComputeReductionResult:
case VPInstruction::ExtractLane:
+ case VPInstruction::ExtractScalarValue:
+ case VPInstruction::ExtractVectorValue:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
@@ -832,6 +834,13 @@ Value *VPInstruction::generate(VPTransformState &State) {
Res->setName(Name);
return Res;
}
+ case VPInstruction::ExtractVectorValue:
+ case VPInstruction::ExtractScalarValue: {
+ assert(getNumOperands() == 2 && "expected single level extractvalue");
+ Value *Op = State.get(getOperand(0));
+ auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
+ return Builder.CreateExtractValue(Op, CI->getZExtValue());
+ }
case VPInstruction::LogicalAnd: {
Value *A = State.get(getOperand(0));
Value *B = State.get(getOperand(1));
@@ -1138,6 +1147,7 @@ bool VPInstruction::isVectorToScalar() const {
bool VPInstruction::isSingleScalar() const {
switch (getOpcode()) {
case Instruction::PHI:
+ case VPInstruction::ExtractScalarValue:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::ResumeForEpilogue:
case VPInstruction::VScale:
@@ -1349,6 +1359,12 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
break;
+ case VPInstruction::ExtractScalarValue:
+ O << "extract-scalar-value";
+ break;
+ case VPInstruction::ExtractVectorValue:
+ O << "extract-vector-value";
+ break;
case VPInstruction::ComputeAnyOfResult:
O << "compute-anyof-result";
break;
@@ -1791,6 +1807,14 @@ static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
SmallVector<const VPValue *> ArgOps(operands());
+ if (VectorIntrinsicID == Intrinsic::vp_load_ff) {
+ auto *StructTy = cast<StructType>(ResultTy);
+ Type *DataTy = toVectorizedTy(StructTy->getStructElementType(0), VF);
+ // TODO: Infer alignment from pointer.
+ Align Alignment;
+ return Ctx.TTI.getMemIntrinsicInstrCost(
+ {VectorIntrinsicID, DataTy, Alignment}, Ctx.CostKind);
+ }
return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 852196e589c59..b4773abc850a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3144,6 +3144,137 @@ void VPlanTransforms::addExplicitVectorLength(
Plan.setUF(1);
}
+void VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan) {
+ using namespace SCEVPatternMatch;
+ VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ VPWidenIntrinsicRecipe *LastFFLoad = nullptr;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getVectorLoopRegion())))
+ for (VPRecipeBase &R : *VPBB)
+ if (match(&R, m_Intrinsic<Intrinsic::vp_load_ff>(m_VPValue(), m_VPValue(),
+ m_VPValue()))) {
+ assert(!LastFFLoad && "Only one FFLoad is supported");
+ LastFFLoad = cast<VPWidenIntrinsicRecipe>(&R);
+ }
+
+ // Skip if no FFLoad.
+ if (!LastFFLoad)
+ return;
+
+ // Ensure FFLoad does not read past the remainder in the last iteration.
+ // Set AVL to min(VF, remainder).
+ VPBuilder Builder(Header, Header->getFirstNonPhi());
+ DebugLoc DL = LastFFLoad->getDebugLoc();
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
+ VPValue *Remainder = Builder.createNaryOp(
+ Instruction::Sub, {&Plan.getVectorTripCount(), CanonicalIVPHI}, DL);
+ VPValue *Cmp =
+ Builder.createICmp(CmpInst::ICMP_ULE, &Plan.getVF(), Remainder, DL);
+ VPValue *AVL = Builder.createSelect(Cmp, &Plan.getVF(), Remainder, DL);
+ Type *CanIVTy = CanonicalIVPHI->getScalarType();
+ Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+ AVL = Builder.createScalarZExtOrTrunc(AVL, I32Ty, CanIVTy, DL);
+ LastFFLoad->setOperand(2, AVL);
+
+ // To prevent branch-on-poison, mask the early-exit condition with
+ // active-lane-mask. Expected pattern here is:
+ // Before:
+ // EMIT vp<%alt.exit.cond> = any-of vp<%cond>
+ // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+ // EMIT branch-on-cond vp<%exit.cond>
+ // After:
+ // EMIT vp<%faulting.lane> = extractvalue vp<%ffload>, 1
+ // EMIT vp<%alm> = active lane mask 0, vp<%faulting.lane>
+ // EMIT vp<%and> = logical-and vp<%alm>, vp<%cond>
+ // EMIT vp<%alt.exit.cond> = any-of vp<%and>
+ // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+ // EMIT branch-on-cond vp<%exit.cond>
+ auto *ExitingLatch =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+ auto *LatchExitingBr = cast<VPInstruction>(ExitingLatch->getTerminator());
+
+ VPValue *VPAnyOf = nullptr;
+ VPValue *VecOp = nullptr;
+ [[maybe_unused]] bool IsExitingOnAnyOfOr =
+ match(LatchExitingBr,
+ m_BranchOnCond(m_BinaryOr(m_VPValue(VPAnyOf), m_VPValue()))) &&
+ match(VPAnyOf, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(VecOp)));
+ assert(IsExitingOnAnyOfOr &&
+ "unexpected exiting sequence in early exit loop");
+
+ // Creates the VPValue for the index of the faulting lane.
+ VPRecipeBase *AnyOfR = VPAnyOf->getDefiningRecipe();
+ Builder.setInsertPoint(cast<VPRecipeBase>(*LastFFLoad->user_begin()));
+ VPValue *One = Plan.getConstantInt(32, 1);
+ VPValue *FaultingLane = Builder.createNaryOp(
+ VPInstruction::ExtractScalarValue, {LastFFLoad, One}, DL);
+ FaultingLane =
+ Builder.createScalarZExtOrTrunc(FaultingLane, CanIVTy, I32Ty, DL);
+ VPValue *ALMMultiplier = Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, 1));
+ Builder.setInsertPoint(AnyOfR);
+ DL = AnyOfR->getDebugLoc();
+ auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, 0));
+ auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+ {Zero, FaultingLane, ALMMultiplier}, DL);
+ auto *R = Builder.createNaryOp(VPInstruction::LogicalAnd, {ALM, VecOp}, DL);
+ AnyOfR->setOperand(0, R);
+
+ // Using FirstActiveLane in the early-exit block is safe: the exiting
+ // conditions guarantee that at least one valid lane precedes any
+ // poisoned lanes.
+}
+
+void VPlanTransforms::convertFFLoadEarlyExitToVLStepping(VPlan &Plan) {
+ using namespace SCEVPatternMatch;
+ // Find loop header by locating FFLoad.
+ VPWidenIntrinsicRecipe *LastFFLoad = nullptr;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry())))
+ for (VPRecipeBase &R : *VPBB)
+ if (match(&R, m_Intrinsic<Intrinsic::vp_load_ff>(m_VPValue(), m_VPValue(),
+ m_VPValue()))) {
+ assert(!LastFFLoad && "Only one FFLoad is supported");
+ LastFFLoad = cast<VPWidenIntrinsicRecipe>(&R);
+ }
+
+ // Skip if no FFLoad.
+ if (!LastFFLoad)
+ return;
+
+ VPBasicBlock *HeaderVPBB = LastFFLoad->getParent();
+ // Replace the IV step (VFxUF) with the faulting lane returned by FFLoad.
+ auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
+ VPValue *Backedge = CanonicalIV->getIncomingValue(1);
+ assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
+ m_Specific(&Plan.getVFxUF()))) &&
+ "Unexpected canonical iv");
+ VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
+ // Expected pattern
+ // EMIT vp<%alm> = active lane mask 0, vp<%faulting.lane>
+ // EMIT vp<%and> = logical-and vp<%alm>, vp<%cond>
+ // EMIT vp<%alt.exit.cond> = any-of vp<%and>
+ // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+ // EMIT branch-on-cond vp<%exit.cond>
+ // Use the faulting-lane index to step the IV.
+ VPBasicBlock *LatchExiting =
+ HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
+ auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
+ VPValue *VPAnyOf = nullptr;
+ VPValue *FaultingLane = nullptr;
+ [[maybe_unused]] bool IsExitingOnAnyOfOr =
+ match(LatchExitingBr,
+ m_BranchOnCond(m_BinaryOr(m_VPValue(VPAnyOf), m_VPValue()))) &&
+ match(VPAnyOf,
+ m_VPInstruction<VPInstruction::AnyOf>(
+ m_VPInstruction<VPInstruction::LogicalAnd>(
+ m_VPInstruction<VPInstruction::ActiveLaneMask>(
+ m_ZeroInt(), m_VPValue(FaultingLane), m_VPValue()),
+ m_VPValue())));
+ assert(IsExitingOnAnyOfOr && "unexpected exiting sequence in early exit loop");
+ CanonicalIVIncrement->setOperand(1, FaultingLane);
+}
+
void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
// Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
// There should be only one EVL PHI in the entire plan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index afdf1655b4622..e640d58755b70 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -283,6 +283,17 @@ struct VPlanTransforms {
/// (branch-on-cond eq AVLNext, 0)
static void canonicalizeEVLLoops(VPlan &Plan);
+ /// Applies to early-exit loops that use FFLoad. FFLoad may yield fewer active
+ /// lanes than VF. To prevent branch-on-poison and over-reads past the vector
+ /// trip count, use the returned VL for both stepping and exit computation.
+ /// Implemented by:
+ /// - adjustFFLoadEarlyExitForPoisonSafety: mask the early-exit condition with
+ /// an active-lane-mask over the first VL lanes; set AVL = min(VF, remainder).
+ /// - convertFFLoadEarlyExitToVLStepping: after region dissolution, convert
+ /// early-exit loops to variable-length stepping.
+ static void adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan);
+ static void convertFFLoadEarlyExitToVLStepping(VPlan &Plan);
+
/// Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToConcreteRecipes(VPlan &Plan);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/find.ll b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
new file mode 100644
index 0000000000000..1609b0acb74ca
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -enable-early-exit-with-ffload -mtriple=riscv64 -mattr=+v -S %s 2>&1 | FileCheck %s
+
+define i64 @find_with_liveout(ptr %first, i8 %value) {
+; CHECK-LABEL: define i64 @find_with_liveout(
+; CHECK-SAME: ptr [[FIRST:%.*]], i8 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[VALUE]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[N_VEC]], [[IV]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ule i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; CHECK-NEXT: [[FIRST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[FIRST]], i64 [[IV]]
+; CHECK-NEXT: [[TMP9:%.*]] = call { <vscale x 16 x i8>, i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr [[FIRST_ADDR]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, i32 } [[TMP9]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, i32 } [[TMP9]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 16 x i8> [[TMP12]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP11]])
+; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = freeze <vscale x 16 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP17]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP13]], i1 false)
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[IV]], [[TMP20]]
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[FIRST]], i64 [[IV1]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ADDR]], align 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP29]], [[VALUE]]
+; CHECK-NEXT: br i1 [[CMP1]], label %[[EXIT]], label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[IV1]], %[[FOR_BODY]] ], [ 1024, %[[FOR_INC]] ], [ 1024, %[[MIDDLE_BLOCK]] ], [ [[TMP21]], %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+ %addr = getelementptr inbounds i8, ptr %first, i64 %iv
+ %1 = load i8, ptr %addr, align 1
+ %cmp1 = icmp eq i8 %1, %value
+ br i1 %cmp1, label %exit, label %for.inc
+
+for.inc:
+ %iv.next = add i64 %iv, 1
+ %cmp.not = icmp eq i64 %iv.next, 1024
+ br i1 %cmp.not, label %exit, label %for.body
+
+exit:
+ %retval = phi i64 [ %iv, %for.body ], [ 1024, %for.inc ]
+ ret i64 %retval
+}
+
+define i32 @find_without_liveout(ptr %first, i8 %value) {
+; CHECK-LABEL: define i32 @find_without_liveout(
+; CHECK-SAME: ptr [[FIRST:%.*]], i8 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[VALUE]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[N_VEC]], [[IV]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ule i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; CHECK-NEXT: [[FIRST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[FIRST]], i64 [[IV]]
+; CHECK-NEXT: [[TMP9:%.*]] = call { <vscale x 16 x i8>, i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr [[FIRST_ADDR]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, i32 } [[TMP9]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, i32 } [[TMP9]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 16 x i8> [[TMP12]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP11]])
+; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = freeze <vscale x 16 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP17]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[FIRST]], i64 [[IV1]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[ADDR]], align 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP26]], [[VALUE]]
+; CHECK-NEXT: br i1 [[CMP1]], label %[[EXIT]], label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ 1, %[[FOR_INC]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+ %addr = getelementptr inbounds i8, ptr %first, i64 %iv
+ %1 = load i8, ptr %addr, align 1
+ %cmp1 = icmp eq i8 %1, %value
+ br i1 %cmp1, label %exit, label %for.inc
+
+for.inc:
+ %iv.next = add i64 %iv, 1
+ %cmp.not = icmp eq i64 %iv.next, 1024
+ br i1 %cmp.not, label %exit, label %for.body
+
+exit:
+ %retval = phi i32 [ 0, %for.body ], [ 1, %for.inc ]
+ ret i32 %retval
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-load-ff-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-load-ff-intrinsics.ll
new file mode 100644
index 0000000000000..53ed9c17439af
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-load-ff-intrinsics.ll
@@ -0,0 +1,45 @@
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -enable-early-exit-with-ffload \
+; RUN: -mtriple=riscv64 -mattr=+v -disable-output < %s 2>&1 | FileCheck %s
+
+define i64 @find_with_liveout(ptr %first, i8 %value) {
+; CHECK: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4,vscale x 8,vscale x 16},UF={1}' {
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
+; CHECK: vector.body:
+; CHECK-NEXT: EMIT-SCALAR vp<[[IV:%.+]]> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: EMIT vp<[[REMAINDER0:%.+]]> = sub vp<%n.vec>, vp<[[IV]]>
+; CHECK-NEXT: EMIT vp<[[COND:%.+]]> = icmp ule vp<[[VF:%.+]]>, vp<[[REMAINDER0]]>
+; CHECK-NEXT: EMIT vp<[[REMAINDER:%.+]]> = select vp<[[COND]]>, vp<[[VF]]>, vp<[[REMAINDER0]]>
+; CHECK-NEXT: EMIT-SCALAR vp<[[REMAINDER32:%.+]]> = trunc vp<[[REMAINDER]]> to i32
+; CHECK-NEXT: CLONE ir<%addr> = getelementptr inbounds ir<%first>, vp<[[IV]]>
+; CHECK-NEXT: WIDEN-INTRINSIC vp<[[STRUCT:%.+]]> = call llvm.vp.load.ff(ir<%addr>, ir<true>, vp<[[REMAINDER32]]>)
+; CHECK-NEXT: EMIT-SCALAR vp<[[FAULTINGLANE:%.+]]> = extract-scalar-value vp<[[STRUCT]]>, ir<1>
+; CHECK-NEXT: EMIT-SCALAR vp<[[FAULTINGLANE64:%.+]]> = zext vp<[[FAULTINGLANE]]> to i64
+; CHECK-NEXT: EMIT vp<[[DATA:%.+]]> = extract-vector-value vp<[[STRUCT]]>, ir<0>
+; CHECK-NEXT: WIDEN ir<%cmp1> = icmp eq vp<[[DATA]]>, vp<[[VALUE:%.+]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[IV]]>, vp<[[FAULTINGLANE64]]>
+; CHECK-NEXT: EMIT vp<[[ALM:%.+]]> = active lane mask ir<0>, vp<[[FAULTINGLANE64]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[ALM1:%.+]]> = logical-and vp<[[ALM]]>, ir<%cmp1>
+; CHECK-NEXT: EMIT vp<[[EARLYEXIT:%.+]]> = any-of vp<[[ALM1]]>
+; CHECK-NEXT: EMIT vp<[[MAINEXIT:%.+]]> = icmp eq vp<%index.next>, vp<%n.vec>
+; CHECK-NEXT: EMIT vp<[[EXIT:%.+]]> = or vp<[[EARLYEXIT]]>, vp<[[MAINEXIT]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EXIT]]>
+; CHECK-NEXT: Successor(s): middle.split, vector.body
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+ %addr = getelementptr inbounds i8, ptr %first, i64 %iv
+ %1 = load i8, ptr %addr, align 1
+ %cmp1 = icmp eq i8 %1, %value
+ br i1 %cmp1, label %exit, label %for.inc
+
+for.inc:
+ %iv.next = add i64 %iv, 1
+ %cmp.not = icmp eq i64 %iv.next, 1024
+ br i1 %cmp.not, label %exit, label %for.body
+
+exit:
+ %retval = phi i64 [ %iv, %for.body ], [ 1024, %for.inc ]
+ ret i64 %retval
+}