[llvm] [LV][NFC] Refactor code for extracting first active element (PR #131118)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 14 03:24:05 PDT 2025
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/131118
>From 8ed000d91f0183168aa4899706bb0c51b0763782 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 13 Mar 2025 09:57:01 +0000
Subject: [PATCH 1/3] [LV][NFC] Refactor code for extracting first active
element
Refactor the code that extracts the first active element of a
vector in the early exit block, in preparation for PR #130766.
I've replaced the VPInstruction::ExtractFirstActive nodes with
a combination of a new VPInstruction::FirstActiveLane node and
an Instruction::ExtractElement node.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 5 +--
.../Transforms/Vectorize/VPlanAnalysis.cpp | 5 ++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 42 +++++++++++--------
.../Transforms/Vectorize/VPlanTransforms.cpp | 12 ++++--
.../LoopVectorize/AArch64/early_exit_costs.ll | 12 ++++--
5 files changed, 46 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f78eb84b0c445..2205e87d2bc17 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -877,9 +877,8 @@ class VPInstruction : public VPRecipeWithIRFlags,
// Returns a scalar boolean value, which is true if any lane of its (only
// boolean) vector operand is true.
AnyOf,
- // Extracts the first active lane of a vector, where the first operand is
- // the predicate, and the second operand is the vector to extract.
- ExtractFirstActive,
+ // Calculates the first active lane index of the vector predicate operand.
+ FirstActiveLane,
};
private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 9b0720760df40..e780e6934eb44 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -50,6 +50,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return SetResultTyFromOp();
switch (Opcode) {
+ case Instruction::ExtractElement:
+ return inferScalarType(R->getOperand(0));
case Instruction::Select: {
Type *ResTy = inferScalarType(R->getOperand(1));
VPValue *OtherV = R->getOperand(2);
@@ -82,7 +84,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::AnyOf:
return SetResultTyFromOp();
- case VPInstruction::ExtractFirstActive:
+ case VPInstruction::FirstActiveLane:
+ return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractFromEnd: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6e396eda6aac6..04df469f15987 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -468,6 +468,11 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *A = State.get(getOperand(0));
return Builder.CreateNot(A, Name);
}
+ case Instruction::ExtractElement: {
+ Value *Vec = State.get(getOperand(0));
+ Value *Idx = State.get(getOperand(1), true);
+ return Builder.CreateExtractElement(Vec, Idx, Name);
+ }
case Instruction::ICmp: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
@@ -723,12 +728,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *A = State.get(getOperand(0));
return Builder.CreateOrReduce(A);
}
- case VPInstruction::ExtractFirstActive: {
- Value *Vec = State.get(getOperand(0));
- Value *Mask = State.get(getOperand(1));
- Value *Ctz = Builder.CreateCountTrailingZeroElems(
- Builder.getInt64Ty(), Mask, true, "first.active.lane");
- return Builder.CreateExtractElement(Vec, Ctz, "early.exit.value");
+ case VPInstruction::FirstActiveLane: {
+ Value *Mask = State.get(getOperand(0));
+ return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
+ true, Name);
}
default:
llvm_unreachable("Unsupported opcode for instruction");
@@ -755,22 +758,24 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
switch (getOpcode()) {
+ case Instruction::ExtractElement: {
+ // Add on the cost of extracting the element.
+ auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+ return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ Ctx.CostKind);
+ }
case VPInstruction::AnyOf: {
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticReductionCost(
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
- case VPInstruction::ExtractFirstActive: {
+ case VPInstruction::FirstActiveLane: {
// Calculate the cost of determining the lane index.
- auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(1)), VF);
+ auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
Type::getInt64Ty(Ctx.LLVMCtx),
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
- InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
- // Add on the cost of extracting the element.
- auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
- return Cost + Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
- Ctx.CostKind);
+ return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
@@ -793,7 +798,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
- getOpcode() == VPInstruction::ExtractFirstActive ||
+ getOpcode() == Instruction::ExtractElement ||
+ getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -853,13 +859,14 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
if (Instruction::isBinaryOp(getOpcode()))
return false;
switch (getOpcode()) {
+ case Instruction::ExtractElement:
case Instruction::ICmp:
case Instruction::Select:
case VPInstruction::AnyOf:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractFromEnd:
- case VPInstruction::ExtractFirstActive:
+ case VPInstruction::FirstActiveLane:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
@@ -970,7 +977,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::Broadcast:
O << "broadcast";
break;
-
case VPInstruction::ExtractFromEnd:
O << "extract-from-end";
break;
@@ -986,8 +992,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::AnyOf:
O << "any-of";
break;
- case VPInstruction::ExtractFirstActive:
- O << "extract-first-active";
+ case VPInstruction::FirstActiveLane:
+ O << "first-active-lane";
break;
default:
O << Instruction::getOpcodeName(getOpcode());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index dacd2c2b0070b..9aae383d35d91 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2158,10 +2158,14 @@ void VPlanTransforms::handleUncountableEarlyExit(
ExitIRI->extractLastLaneOfOperand(MiddleBuilder);
}
// Add the incoming value from the early exit.
- if (!IncomingFromEarlyExit->isLiveIn())
- IncomingFromEarlyExit =
- EarlyExitB.createNaryOp(VPInstruction::ExtractFirstActive,
- {IncomingFromEarlyExit, EarlyExitTakenCond});
+ if (!IncomingFromEarlyExit->isLiveIn()) {
+ VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
+ VPInstruction::FirstActiveLane, {EarlyExitTakenCond}, nullptr,
+ "first.active.lane");
+ IncomingFromEarlyExit = EarlyExitB.createNaryOp(
+ Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
+ nullptr, "early.exit.value");
+ }
ExitIRI->addOperand(IncomingFromEarlyExit);
}
MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
index 55c6c43b6306a..a9c9af2c99932 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
@@ -11,8 +11,10 @@ define i64 @same_exit_block_pre_inc_use1_sve() #1 {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_sve'
; CHECK: LV: Selecting VF: vscale x 16
; CHECK: Calculating cost of work in exit block vector.early.exit
-; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
-; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
+; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
+; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
+; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
+; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<%first.active.lane>.1
; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
entry:
%p1 = alloca [1024 x i8]
@@ -48,8 +50,10 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_nosve'
; CHECK: LV: Selecting VF: 16
; CHECK: Calculating cost of work in exit block vector.early.exit
-; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
-; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
+; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
+; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
+; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
+; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<%first.active.lane>.1
; CHECK: LV: Minimum required TC for runtime checks to be profitable:176
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176)
; CHECK-NEXT: LV: Too many memory checks needed.
>From b0c8a8446a573431ad1ae4a838108260666962b0 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 14 Mar 2025 08:05:32 +0000
Subject: [PATCH 2/3] Address review comments
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 ++++-
.../Transforms/LoopVectorize/AArch64/early_exit_costs.ll | 4 ++--
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 04df469f15987..445c0da36d249 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -469,8 +469,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Builder.CreateNot(A, Name);
}
case Instruction::ExtractElement: {
+ assert(State.VF.isVector() && "Only extract elements from vectors");
Value *Vec = State.get(getOperand(0));
- Value *Idx = State.get(getOperand(1), true);
+ Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
return Builder.CreateExtractElement(Vec, Idx, Name);
}
case Instruction::ICmp: {
@@ -887,6 +888,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
return false;
case Instruction::PHI:
return true;
+ case Instruction::ExtractElement:
+ return Op == getOperand(1);
case Instruction::ICmp:
case Instruction::Select:
case Instruction::Or:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
index a9c9af2c99932..4d7c5d088034d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
@@ -14,7 +14,7 @@ define i64 @same_exit_block_pre_inc_use1_sve() #1 {
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
-; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<%first.active.lane>.1
+; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
entry:
%p1 = alloca [1024 x i8]
@@ -53,7 +53,7 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
-; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<%first.active.lane>.1
+; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
; CHECK: LV: Minimum required TC for runtime checks to be profitable:176
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176)
; CHECK-NEXT: LV: Too many memory checks needed.
>From 9f442f03696cb4729b243697d650db176e0cc5c6 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 14 Mar 2025 10:23:21 +0000
Subject: [PATCH 3/3] Address review comment
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 445c0da36d249..d97805d874955 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -886,10 +886,10 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
switch (getOpcode()) {
default:
return false;
- case Instruction::PHI:
- return true;
case Instruction::ExtractElement:
return Op == getOperand(1);
+ case Instruction::PHI:
+ return true;
case Instruction::ICmp:
case Instruction::Select:
case Instruction::Or:
More information about the llvm-commits
mailing list