[llvm] [VPlan] Introduces explicit broadcast for live-in constants. (PR #133213)
Elvis Wang via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 23 02:31:38 PDT 2025
https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/133213
>From f15d348dfd94bdfcf49fff9dc14e8fded2fc3308 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Wed, 26 Mar 2025 18:34:25 -0700
Subject: [PATCH] [VPlan] Introduces explicit broadcast for live-in constants.
This patch focus on explicit show the broadcast for the live-in
constants. This can help the VPlan-based cost model the broadcast cost
and track the register pressure of the broadcast value in the future.
Live-in constants usually only has single user so
insert the `broadcast` before the user to reduce the live range of the
broadcast value and prevent generated vector IR changes.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 19 ++++++++++++++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 ++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 26 ++++++++++++++-----
.../RISCV/riscv-vector-reverse.ll | 14 +++++-----
...ze-force-tail-with-evl-inloop-reduction.ll | 6 +++--
...vectorize-force-tail-with-evl-reduction.ll | 6 +++--
.../RISCV/vplan-vp-select-intrinsics.ll | 3 ++-
.../LoopVectorize/vplan-predicate-switch.ll | 6 +++--
8 files changed, 64 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f4163b0743a9a..b4298bef7f98d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1339,6 +1339,17 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ switch (Opcode) {
+ default:
+ return false;
+ case Instruction::ExtractValue:
+ return Op == getOperand(1);
+ }
+ }
};
/// VPWidenCastRecipe is a recipe to create vector cast instructions.
@@ -1533,6 +1544,14 @@ class VPWidenCallRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ // Scalar called fuction cannot be vectorized.
+ return Op == getOperand(getNumOperands() - 1);
+ }
};
/// A recipe representing a sequence of load -> update -> store as part of
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3e12fdf9163eb..c84016ba166f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -938,6 +938,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
default:
return false;
case Instruction::ExtractElement:
+ case Instruction::ExtractValue:
return Op == getOperand(1);
case Instruction::PHI:
return true;
@@ -959,8 +960,9 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::PtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
case VPInstruction::ComputeAnyOfResult:
- case VPInstruction::ComputeFindLastIVResult:
return Op == getOperand(1);
+ case VPInstruction::ComputeFindLastIVResult:
+ return Op == getOperand(1) || Op == getOperand(2);
};
llvm_unreachable("switch should return");
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ac6be09ef271d..c5259c9ef4f2a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3073,10 +3073,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
auto *VectorPreheader = Plan.getVectorPreheader();
for (VPValue *VPV : VPValues) {
- if (all_of(VPV->users(),
- [VPV](VPUser *U) { return U->usesScalars(VPV); }) ||
- (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
- isa<Constant>(VPV->getLiveInIRValue())))
+ if (all_of(VPV->users(), [VPV](VPUser *U) { return U->usesScalars(VPV); }))
continue;
// Add explicit broadcast at the insert point that dominates all users.
@@ -3093,8 +3090,25 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
"All users must be in the vector preheader or dominated by it");
}
- VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
- auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
+ VPInstruction *Broadcast;
+ if (VPV->isLiveIn() && isa_and_nonnull<Constant>(VPV->getLiveInIRValue())) {
+ // We cannot replace the constant live-ins for PHIs by broadcast in the
+ // same VPBB because it will break PHI. Also cannot replace the
+ // VPWidenGEPRecipe since it broadcasts the generated pointer instead of
+ // operands.
+ if (auto *R = dyn_cast_if_present<VPRecipeBase>(*(VPV->users().begin()));
+ R && !isa<VPHeaderPHIRecipe, VPWidenPHIRecipe, VPWidenGEPRecipe>(R) &&
+ !VPV->hasMoreThanOneUniqueUser()) {
+ Broadcast = new VPInstruction(VPInstruction::Broadcast, {VPV});
+ // Insert just before the user to reduce register pressure.
+ Broadcast->insertBefore(R);
+ } else {
+ continue;
+ }
+ } else {
+ VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
+ Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
+ }
VPV->replaceUsesWithIf(Broadcast,
[VPV, Broadcast](VPUser &U, unsigned Idx) {
return Broadcast != &U && !U.usesScalars(VPV);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index b23b0ce759d49..315c054a53a19 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -239,10 +239,11 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, ir<1>
+; CHECK-NEXT: EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<1>
+; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, vp<[[BROADCAST]]>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%5>, ir<%add9>
+; CHECK-NEXT: vp<[[VECTOR_END_POINTER1:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT: WIDEN store vp<[[VECTOR_END_POINTER1]]>, ir<%add9>
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
; CHECK-NEXT: Successor(s): middle.block, vector.body
@@ -648,10 +649,11 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00>
+; CHECK-NEXT: EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<1.000000e+00>
+; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, vp<[[BROADCAST]]>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%5>, ir<%conv1>
+; CHECK-NEXT: vp<[[VECTOR_END_POINTER:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT: WIDEN store vp<[[VECTOR_END_POINTER]]>, ir<%conv1>
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
; CHECK-NEXT: Successor(s): middle.block, vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
index 87ac697bf2026..8361548b934ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
@@ -1847,7 +1847,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
-; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP14]]
+; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -1967,7 +1968,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
-; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP14]]
+; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
index 83deebe42d38f..7165de54dc252 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -1901,7 +1901,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
-; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP13]]
+; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -2021,7 +2022,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
-; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP13]]
+; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index 8c0e4da51f720..4a6a9584bb4ad 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -42,7 +42,8 @@
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
- ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = sub ir<0>, ir<[[LD2]]>
+ ; IF-EVL-NEXT: EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<0>
+ ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = sub vp<[[BROADCAST]]>, ir<[[LD2]]>
; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<[[LD2]]>, ir<[[SUB]]>, vp<[[EVL]]>)
; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add vp<[[SELECT]]>, ir<[[LD1]]>
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[EVL_PHI]]>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index d441e4123975c..a21432c5b87e3 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -23,8 +23,10 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[WIDE_PTR:%.+]]> = vector-pointer vp<[[PTR]]>
; CHECK-NEXT: WIDEN ir<%l> = load vp<[[WIDE_PTR]]>
-; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12>
-; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13>
+; CHECK-NEXT: EMIT vp<[[BROADCAST1:%.+]]> = broadcast ir<-12>
+; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, vp<[[BROADCAST1]]>
+; CHECK-NEXT: EMIT vp<[[BROADCAST2:%.+]]> = broadcast ir<13>
+; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, vp<[[BROADCAST2]]>
; CHECK-NEXT: EMIT vp<[[OR_CASES:%.+]]> = or vp<[[C1]]>, vp<[[C2]]>
; CHECK-NEXT: EMIT vp<[[DEFAULT_MASK:%.+]]> = not vp<[[OR_CASES]]>
; CHECK-NEXT: Successor(s): pred.store
More information about the llvm-commits
mailing list