[llvm] [VPlan] Introduces explicit broadcast for live-in constants. (PR #133213)

Mon Jun 23 02:31:38 PDT 2025

https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/133213

>From f15d348dfd94bdfcf49fff9dc14e8fded2fc3308 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Wed, 26 Mar 2025 18:34:25 -0700
Subject: [PATCH] [VPlan] Introduces explicit broadcast for live-in constants.

This patch focus on explicit show the broadcast for the live-in
constants. This can help the VPlan-based cost model the broadcast cost
and track the register pressure of the broadcast value in the future.

Live-in constants usually only has single user so
insert the `broadcast` before the user to reduce the live range of the
broadcast value and prevent generated vector IR changes.
---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 19 ++++++++++++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  4 ++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 26 ++++++++++++++-----
 .../RISCV/riscv-vector-reverse.ll             | 14 +++++-----
 ...ze-force-tail-with-evl-inloop-reduction.ll |  6 +++--
 ...vectorize-force-tail-with-evl-reduction.ll |  6 +++--
 .../RISCV/vplan-vp-select-intrinsics.ll       |  3 ++-
 .../LoopVectorize/vplan-predicate-switch.ll   |  6 +++--
 8 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f4163b0743a9a..b4298bef7f98d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1339,6 +1339,17 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
+
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    switch (Opcode) {
+    default:
+      return false;
+    case Instruction::ExtractValue:
+      return Op == getOperand(1);
+    }
+  }
 };
 
 /// VPWidenCastRecipe is a recipe to create vector cast instructions.
@@ -1533,6 +1544,14 @@ class VPWidenCallRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    // Scalar called fuction cannot be vectorized.
+    return Op == getOperand(getNumOperands() - 1);
+  }
 };
 
 /// A recipe representing a sequence of load -> update -> store as part of
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3e12fdf9163eb..c84016ba166f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -938,6 +938,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   default:
     return false;
   case Instruction::ExtractElement:
+  case Instruction::ExtractValue:
     return Op == getOperand(1);
   case Instruction::PHI:
     return true;
@@ -959,8 +960,9 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   case VPInstruction::PtrAdd:
     return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
   case VPInstruction::ComputeAnyOfResult:
-  case VPInstruction::ComputeFindLastIVResult:
     return Op == getOperand(1);
+  case VPInstruction::ComputeFindLastIVResult:
+    return Op == getOperand(1) || Op == getOperand(2);
   };
   llvm_unreachable("switch should return");
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ac6be09ef271d..c5259c9ef4f2a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3073,10 +3073,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
 
   auto *VectorPreheader = Plan.getVectorPreheader();
   for (VPValue *VPV : VPValues) {
-    if (all_of(VPV->users(),
-               [VPV](VPUser *U) { return U->usesScalars(VPV); }) ||
-        (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
-         isa<Constant>(VPV->getLiveInIRValue())))
+    if (all_of(VPV->users(), [VPV](VPUser *U) { return U->usesScalars(VPV); }))
       continue;
 
     // Add explicit broadcast at the insert point that dominates all users.
@@ -3093,8 +3090,25 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
                "All users must be in the vector preheader or dominated by it");
     }
 
-    VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
-    auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
+    VPInstruction *Broadcast;
+    if (VPV->isLiveIn() && isa_and_nonnull<Constant>(VPV->getLiveInIRValue())) {
+      // We cannot replace the constant live-ins for PHIs by broadcast in the
+      // same VPBB because it will break PHI. Also cannot replace the
+      // VPWidenGEPRecipe since it broadcasts the generated pointer instead of
+      // operands.
+      if (auto *R = dyn_cast_if_present<VPRecipeBase>(*(VPV->users().begin()));
+          R && !isa<VPHeaderPHIRecipe, VPWidenPHIRecipe, VPWidenGEPRecipe>(R) &&
+          !VPV->hasMoreThanOneUniqueUser()) {
+        Broadcast = new VPInstruction(VPInstruction::Broadcast, {VPV});
+        // Insert just before the user to reduce register pressure.
+        Broadcast->insertBefore(R);
+      } else {
+        continue;
+      }
+    } else {
+      VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
+      Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
+    }
     VPV->replaceUsesWithIf(Broadcast,
                            [VPV, Broadcast](VPUser &U, unsigned Idx) {
                              return Broadcast != &U && !U.usesScalars(VPV);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index b23b0ce759d49..315c054a53a19 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -239,10 +239,11 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
 ; CHECK-NEXT:    vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
 ; CHECK-NEXT:    WIDEN ir<%19> = load vp<%4>
-; CHECK-NEXT:    WIDEN ir<%add9> = add ir<%19>, ir<1>
+; CHECK-NEXT:    EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<1>
+; CHECK-NEXT:    WIDEN ir<%add9> = add ir<%19>, vp<[[BROADCAST]]>
 ; CHECK-NEXT:    CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:    vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT:    WIDEN store vp<%5>, ir<%add9>
+; CHECK-NEXT:    vp<[[VECTOR_END_POINTER1:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT:    WIDEN store vp<[[VECTOR_END_POINTER1]]>, ir<%add9>
 ; CHECK-NEXT:    EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
 ; CHECK-NEXT:    EMIT branch-on-count vp<%index.next>, ir<%n.vec>
 ; CHECK-NEXT:  Successor(s): middle.block, vector.body
@@ -648,10 +649,11 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
 ; CHECK-NEXT:    vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
 ; CHECK-NEXT:    WIDEN ir<%19> = load vp<%4>
-; CHECK-NEXT:    WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00>
+; CHECK-NEXT:    EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<1.000000e+00>
+; CHECK-NEXT:    WIDEN ir<%conv1> = fadd ir<%19>, vp<[[BROADCAST]]>
 ; CHECK-NEXT:    CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:    vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT:    WIDEN store vp<%5>, ir<%conv1>
+; CHECK-NEXT:    vp<[[VECTOR_END_POINTER:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT:    WIDEN store vp<[[VECTOR_END_POINTER]]>, ir<%conv1>
 ; CHECK-NEXT:    EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
 ; CHECK-NEXT:    EMIT branch-on-count vp<%index.next>, ir<%n.vec>
 ; CHECK-NEXT:  Successor(s): middle.block, vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
index 87ac697bf2026..8361548b934ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
@@ -1847,7 +1847,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
-; IF-EVL-NEXT:    [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP14]]
+; IF-EVL-NEXT:    [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -1967,7 +1968,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
-; IF-EVL-NEXT:    [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP14]]
+; IF-EVL-NEXT:    [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
index 83deebe42d38f..7165de54dc252 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -1901,7 +1901,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
-; IF-EVL-NEXT:    [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP13]]
+; IF-EVL-NEXT:    [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -2021,7 +2022,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
-; IF-EVL-NEXT:    [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP13]]
+; IF-EVL-NEXT:    [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index 8c0e4da51f720..4a6a9584bb4ad 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -42,7 +42,8 @@
  ; IF-EVL-NEXT:   vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
  ; IF-EVL-NEXT:   WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
  ; IF-EVL-NEXT:   WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
- ; IF-EVL-NEXT:   WIDEN ir<[[SUB:%.+]]> = sub ir<0>, ir<[[LD2]]>
+ ; IF-EVL-NEXT:   EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<0>
+ ; IF-EVL-NEXT:   WIDEN ir<[[SUB:%.+]]> = sub vp<[[BROADCAST]]>, ir<[[LD2]]>
  ; IF-EVL-NEXT:   WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<[[LD2]]>, ir<[[SUB]]>, vp<[[EVL]]>)
  ; IF-EVL-NEXT:   WIDEN ir<[[ADD:%.+]]> = add vp<[[SELECT]]>, ir<[[LD1]]>
  ; IF-EVL-NEXT:   CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[EVL_PHI]]>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index d441e4123975c..a21432c5b87e3 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -23,8 +23,10 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; CHECK-NEXT:   EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]>
 ; CHECK-NEXT:   vp<[[WIDE_PTR:%.+]]> = vector-pointer vp<[[PTR]]>
 ; CHECK-NEXT:   WIDEN ir<%l> = load vp<[[WIDE_PTR]]>
-; CHECK-NEXT:   EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12>
-; CHECK-NEXT:   EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13>
+; CHECK-NEXT:   EMIT vp<[[BROADCAST1:%.+]]> = broadcast ir<-12>
+; CHECK-NEXT:   EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, vp<[[BROADCAST1]]>
+; CHECK-NEXT:   EMIT vp<[[BROADCAST2:%.+]]> = broadcast ir<13>
+; CHECK-NEXT:   EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, vp<[[BROADCAST2]]>
 ; CHECK-NEXT:   EMIT vp<[[OR_CASES:%.+]]> = or vp<[[C1]]>, vp<[[C2]]>
 ; CHECK-NEXT:   EMIT vp<[[DEFAULT_MASK:%.+]]> = not vp<[[OR_CASES]]>
 ; CHECK-NEXT: Successor(s): pred.store