[llvm] c950a72 - [VPlan] Support scalar VF for ExtractLane and FirstActiveLane.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 25 13:45:37 PDT 2025
Author: Florian Hahn
Date: 2025-08-25T21:45:21+01:00
New Revision: c950a729749fc32a9431643d3d149bd833e01451
URL: https://github.com/llvm/llvm-project/commit/c950a729749fc32a9431643d3d149bd833e01451
DIFF: https://github.com/llvm/llvm-project/commit/c950a729749fc32a9431643d3d149bd833e01451.diff
LOG: [VPlan] Support scalar VF for ExtractLane and FirstActiveLane.
Extend ExtractLane and FirstActiveLane to support scalar VFs. This
allows correct handling when interleaving with VF = 1.
Alive2 proofs:
- Fixed codegen with this patch: https://alive2.llvm.org/ce/z/8Y5_Vc
(verifies as correct)
- Original codegen: https://alive2.llvm.org/ce/z/twdg3X (doesn't
verify)
Fixes https://github.com/llvm/llvm-project/issues/154967.
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
llvm/lib/Transforms/Vectorize/VPlanTransforms.h
llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll
llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a0f306c12754f..cc99386a4e660 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8634,8 +8634,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
return !CM.requiresScalarEpilogue(VF.isVector());
},
Range);
- VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(),
- Range);
+ VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
CM.foldTailByMasking());
@@ -8926,7 +8925,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
OrigLoop, *LI, Legal->getWidestInductionType(),
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
VPlanTransforms::handleEarlyExits(*Plan,
- /*HasUncountableExit*/ false, Range);
+ /*HasUncountableExit*/ false);
VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
/*TailFolded*/ false);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 80b48de57b406..4a8b4b8d04840 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -538,8 +538,7 @@ VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy,
}
void VPlanTransforms::handleEarlyExits(VPlan &Plan,
- bool HasUncountableEarlyExit,
- VFRange &Range) {
+ bool HasUncountableEarlyExit) {
auto *MiddleVPBB = cast<VPBasicBlock>(
Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]);
auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor());
@@ -559,8 +558,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
assert(!HandledUncountableEarlyExit &&
"can handle exactly one uncountable early exit");
handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
- cast<VPBasicBlock>(HeaderVPB), LatchVPBB,
- Range);
+ cast<VPBasicBlock>(HeaderVPB), LatchVPBB);
HandledUncountableEarlyExit = true;
} else {
for (VPRecipeBase &R : EB->phis())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2f3ee1ff61d20..f804846f82cc5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -919,8 +919,15 @@ Value *VPInstruction::generate(VPTransformState &State) {
unsigned LastOpIdx = getNumOperands() - 1;
Value *Res = nullptr;
for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
- Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
- Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
+ Value *TrailingZeros =
+ State.VF.isScalar()
+ ? Builder.CreateZExt(
+ Builder.CreateICmpEQ(State.get(getOperand(Idx)),
+ Builder.getFalse()),
+ Builder.getInt64Ty())
+ : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
+ State.get(getOperand(Idx)),
+ true, Name);
Value *Current = Builder.CreateAdd(
Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
if (Res) {
@@ -1029,6 +1036,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
switch (getOpcode()) {
case Instruction::ExtractElement:
case VPInstruction::ExtractLane: {
+ if (VF.isScalar()) {
+ // ExtractLane with VF=1 takes care of handling extracting across multiple
+ // parts.
+ return 0;
+ }
+
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -1040,8 +1053,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::FirstActiveLane: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
+ if (VF.isScalar())
+ return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy),
+ CmpInst::ICMP_EQ, Ctx.CostKind);
// Calculate the cost of determining the lane index.
- auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+ auto *PredTy = toVectorTy(ScalarTy, VF);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
Type::getInt64Ty(Ctx.LLVMCtx),
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e0bf241c73fdd..56175e7f18145 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2983,9 +2983,11 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
R->eraseFromParent();
}
-void VPlanTransforms::handleUncountableEarlyExit(
- VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
- VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
+void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
+ VPBasicBlock *EarlyExitVPBB,
+ VPlan &Plan,
+ VPBasicBlock *HeaderVPBB,
+ VPBasicBlock *LatchVPBB) {
VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
if (!EarlyExitVPBB->getSinglePredecessor() &&
EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
@@ -3038,13 +3040,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
}
VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
- auto IsVector = [](ElementCount VF) { return VF.isVector(); };
- // When the VFs are vectors, need to add `extract` to get the incoming value
- // from early exit. When the range contains scalar VF, limit the range to
- // scalar VF to prevent mis-compilation for the range containing both scalar
- // and vector VFs.
- if (!IncomingFromEarlyExit->isLiveIn() &&
- LoopVectorizationPlanner::getDecisionAndClampRange(IsVector, Range)) {
+ if (!IncomingFromEarlyExit->isLiveIn()) {
// Update the incoming value from the early exit.
VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 700b94621d5fb..0b7769642214d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -70,8 +70,8 @@ struct VPlanTransforms {
PredicatedScalarEvolution &PSE);
/// Update \p Plan to account for all early exits.
- LLVM_ABI_FOR_TEST static void
- handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range);
+ LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan,
+ bool HasUncountableExit);
/// If a check is needed to guard executing the scalar epilogue loop, it will
/// be added to the middle block.
@@ -207,8 +207,7 @@ struct VPlanTransforms {
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
VPBasicBlock *EarlyExitVPBB,
VPlan &Plan, VPBasicBlock *HeaderVPBB,
- VPBasicBlock *LatchVPBB,
- VFRange &Range);
+ VPBasicBlock *LatchVPBB);
/// Replace loop regions with explicit CFG.
static void dissolveLoopRegions(VPlan &Plan);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
index 19ee763e6ffae..7ae50a5e4a075 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
@@ -94,6 +94,8 @@ define i64 @vectorization_not_profitable_due_to_trunc(ptr dereferenceable(800) %
; CHECK-LABEL: LV: Checking a loop in 'vectorization_not_profitable_due_to_trunc'
; CHECK: LV: Selecting VF: 1.
; CHECK-NEXT: Calculating cost of work in exit block vector.early.exit:
+; CHECK-NEXT: Cost of 1 for VF 1: EMIT vp<%first.active.lane> = first-active-lane ir<%t>
+; CHECK-NEXT: Cost of 0 for VF 1: EMIT vp<%early.exit.value> = extract-lane vp<%first.active.lane>, ir<%l>
; CHECK-NEXT: LV: Vectorization is possible but not beneficial.
entry:
br label %loop.header
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll
index 9f50b1eb187ad..3402d54ad40a7 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll
@@ -33,6 +33,28 @@ define i8 @iv_used_in_exit_with_math(i8 noundef %g) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[RETURN:.*]]
; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i1 [[TMP8]], false
+; CHECK-NEXT: [[TMP33:%.*]] = zext i1 [[TMP32]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 1, [[TMP33]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i1 [[TMP7]], false
+; CHECK-NEXT: [[TMP14:%.*]] = zext i1 [[TMP13]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 0, [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP14]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i1 [[TMP8]], false
+; CHECK-NEXT: [[TMP22:%.*]] = zext i1 [[TMP21]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 1, [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i1 [[TMP7]], false
+; CHECK-NEXT: [[TMP25:%.*]] = zext i1 [[TMP24]] to i64
+; CHECK-NEXT: [[TMP26:%.*]] = add i64 0, [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP25]], 1
+; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[TMP26]], i64 [[TMP23]]
+; CHECK-NEXT: [[TMP29:%.*]] = trunc i64 [[TMP28]] to i32
+; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[INDEX]], [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i8
; CHECK-NEXT: br label %[[RETURN]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -47,8 +69,8 @@ define i8 @iv_used_in_exit_with_math(i8 noundef %g) {
; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 4
; CHECK-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[RETURN]]:
-; CHECK-NEXT: [[RES_IV1:%.*]] = phi i8 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 32, %[[MIDDLE_BLOCK]] ], [ [[OFFSET_IDX]], %[[VECTOR_EARLY_EXIT]] ]
-; CHECK-NEXT: [[RES_IV2:%.*]] = phi i8 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ [[OFFSET_IDX]], %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RES_IV1:%.*]] = phi i8 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 32, %[[MIDDLE_BLOCK]] ], [ [[TMP20]], %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RES_IV2:%.*]] = phi i8 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ [[TMP31]], %[[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: [[RES:%.*]] = add i8 [[RES_IV1]], [[RES_IV2]]
; CHECK-NEXT: ret i8 [[RES]]
;
@@ -102,6 +124,26 @@ define i32 @iv_used_in_exit_with_loads(ptr align 4 dereferenceable(128) %src) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[RETURN:.*]]
; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i1 [[TMP8]], false
+; CHECK-NEXT: [[TMP31:%.*]] = zext i1 [[TMP30]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 1, [[TMP31]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i1 [[TMP7]], false
+; CHECK-NEXT: [[TMP14:%.*]] = zext i1 [[TMP13]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 0, [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP14]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i1 [[TMP8]], false
+; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 1, [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i1 [[TMP7]], false
+; CHECK-NEXT: [[TMP24:%.*]] = zext i1 [[TMP23]] to i64
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 0, [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ne i64 [[TMP24]], 1
+; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[TMP25]], i64 [[TMP22]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[INDEX]], [[TMP28]]
; CHECK-NEXT: br label %[[RETURN]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -116,8 +158,8 @@ define i32 @iv_used_in_exit_with_loads(ptr align 4 dereferenceable(128) %src) {
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 32
; CHECK-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[RETURN]]:
-; CHECK-NEXT: [[RES_IV1:%.*]] = phi i32 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 32, %[[MIDDLE_BLOCK]] ], [ [[INDEX]], %[[VECTOR_EARLY_EXIT]] ]
-; CHECK-NEXT: [[RES_IV2:%.*]] = phi i32 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ [[INDEX]], %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RES_IV1:%.*]] = phi i32 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 32, %[[MIDDLE_BLOCK]] ], [ [[TMP19]], %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RES_IV2:%.*]] = phi i32 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ [[TMP29]], %[[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: [[RES:%.*]] = add i32 [[RES_IV1]], [[RES_IV2]]
; CHECK-NEXT: ret i32 [[RES]]
;
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index 56f685801151a..383f79bc87a45 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -75,8 +75,7 @@ class VPlanTestIRBase : public testing::Test {
auto Plan = VPlanTransforms::buildVPlan0(L, *LI, IntegerType::get(*Ctx, 64),
{}, PSE);
- VFRange R(ElementCount::getFixed(1), ElementCount::getFixed(2));
- VPlanTransforms::handleEarlyExits(*Plan, false, R);
+ VPlanTransforms::handleEarlyExits(*Plan, false);
VPlanTransforms::addMiddleCheck(*Plan, true, false);
VPlanTransforms::createLoopRegions(*Plan);
More information about the llvm-commits
mailing list