[llvm] cd2caf6 - [LV] Simplify extract-lane with scalar operand to the scalar value itself. (#174534)

Sun Jan 11 18:03:48 PST 2026

Author: Elvis Wang
Date: 2026-01-12T10:03:44+08:00
New Revision: cd2caf658071157b35471f54a4afe2aa8f86b473

URL: https://github.com/llvm/llvm-project/commit/cd2caf658071157b35471f54a4afe2aa8f86b473
DIFF: https://github.com/llvm/llvm-project/commit/cd2caf658071157b35471f54a4afe2aa8f86b473.diff

LOG: [LV] Simplify extract-lane with scalar operand to the scalar value itself. (#174534)

This patch simplifies extract-lane(%lane_num, %X) to %X when %X is a
scalar value. Extracting from a scalar is redundant since there is only
one value to extract.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VPlan.h
    llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
    llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
    llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
    llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
    llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
    llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 83fe45bfd0bbf..4ee5214ca05ab 100644

--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1150,6 +1150,11 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
   /// underlying ingredient.
   bool doesGeneratePerAllLanes() const;
 
+  /// Return the number of operands determined by the opcode of the
+  /// VPInstruction. Returns -1u if the number of operands cannot be determined
+  /// directly by the opcode.
+  static unsigned getNumOperandsForOpcode(unsigned Opcode);
+
 private:
   typedef unsigned char OpcodeTy;
   OpcodeTy Opcode;
@@ -1166,13 +1171,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
   /// existing value is returned rather than a generated one.
   Value *generate(VPTransformState &State);
 
-#if !defined(NDEBUG)
-  /// Return the number of operands determined by the opcode of the
-  /// VPInstruction. Returns -1u if the number of operands cannot be determined
-  /// directly by the opcode.
-  static unsigned getNumOperandsForOpcode(unsigned Opcode);
-#endif
-
 public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                 const VPIRFlags &Flags = {}, const VPIRMetadata &MD = {},

diff  --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 3732d009b9537..e22a90426b599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -291,7 +291,8 @@ struct Recipe_match {
 
     if (R->getNumOperands() != std::tuple_size_v<Ops_t>) {
       [[maybe_unused]] auto *RepR = dyn_cast<VPReplicateRecipe>(R);
-      assert((Opcode == Instruction::PHI ||
+      assert(((isa<VPInstruction>(R) &&
+               VPInstruction::getNumOperandsForOpcode(Opcode) == -1u) ||
               (RepR && std::tuple_size_v<Ops_t> ==
                            RepR->getNumOperands() - RepR->isPredicated())) &&
              "non-variadic recipe with matched opcode does not have the "

diff  --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2c0772320c3cf..5716a6f82259e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -453,7 +453,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case VPInstruction::BranchOnCount:
   case VPInstruction::BranchOnTwoConds:
   case VPInstruction::ComputeReductionResult:
-  case VPInstruction::ExtractLane:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
   case VPInstruction::PtrAdd:
@@ -476,6 +475,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case VPInstruction::LastActiveLane:
   case VPInstruction::SLPLoad:
   case VPInstruction::SLPStore:
+  case VPInstruction::ExtractLane:
     // Cannot determine the number of operands from the opcode.
     return -1u;
   }

diff  --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a430f13f0c9c0..afa74937fc681 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1518,6 +1518,14 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
   if (!Plan->isUnrolled())
     return;
 
+  // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
+  // After unrolling, extract-lane may be used to extract values from multiple
+  // scalar sources. Only simplify when extracting from a single scalar source.
+  if (match(Def, m_ExtractLane(m_VPValue(), m_VPValue(A))) &&
+      vputils::isSingleScalar(A)) {
+    return Def->replaceAllUsesWith(A);
+  }
+
   // Hoist an invariant increment Y of a phi X, by having X start at Y.
   if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
       isa<VPPhi>(X)) {

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
index 77b584655187c..3f9a17591a717 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
@@ -94,8 +94,6 @@ define i64 @vectorization_not_profitable_due_to_trunc(ptr dereferenceable(800) %
 ; CHECK-LABEL: LV: Checking a loop in 'vectorization_not_profitable_due_to_trunc'
 ; CHECK: LV: Selecting VF: 1.
 ; CHECK-NEXT: Calculating cost of work in exit block vector.early.exit:
-; CHECK-NEXT: Cost of 1 for VF 1: EMIT vp<%first.active.lane> = first-active-lane ir<%t>
-; CHECK-NEXT: Cost of 0 for VF 1: EMIT vp<%early.exit.value> = extract-lane vp<%first.active.lane>, ir<%l>
 ; CHECK: LV: Vectorization is possible but not beneficial.
 entry:
   br label %loop.header

diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index 549222cd919da..3f999a5cc8fc5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -242,14 +242,9 @@ define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP5]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[TMP8]]
 ; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i64 [[TMP12]]
+; CHECK-NEXT:    ret i64 [[V]]
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 16a20c86e38f0..adda7c362b8e8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -128,14 +128,9 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; SCALABLE-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; SCALABLE:       [[MIDDLE_BLOCK]]:
-; SCALABLE-NEXT:    [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP5]], 1
-; SCALABLE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP7]], 2
-; SCALABLE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP11]], 0
-; SCALABLE-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[LAST_ACTIVE_LANE]]
 ; SCALABLE-NEXT:    br label %[[FOR_END:.*]]
 ; SCALABLE:       [[FOR_END]]:
-; SCALABLE-NEXT:    ret i64 [[TMP12]]
+; SCALABLE-NEXT:    ret i64 [[TMP6]]
 ;
 ; FIXEDLEN-LABEL: define i64 @uniform_load_outside_use(
 ; FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
@@ -192,14 +187,9 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
 ; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; TF-SCALABLE-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; TF-SCALABLE:       [[MIDDLE_BLOCK]]:
-; TF-SCALABLE-NEXT:    [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP5]], 1
-; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
-; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 0
-; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[LAST_ACTIVE_LANE]]
 ; TF-SCALABLE-NEXT:    br label %[[FOR_END:.*]]
 ; TF-SCALABLE:       [[FOR_END]]:
-; TF-SCALABLE-NEXT:    ret i64 [[TMP12]]
+; TF-SCALABLE-NEXT:    ret i64 [[V]]
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
index 70a808f9b336e..e2ed2abcc81f8 100644
--- a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
@@ -46,14 +46,12 @@ define i64 @test1(i64 %y) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i64> splat (i64 3), [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP2]], <4 x i64> splat (i64 77), <4 x i64> [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP2]], i64 77, i64 [[TMP3]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 false)
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI]], i64 [[TMP4]]
 ; CHECK-NEXT:    br label [[COND_END:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret i64 [[TMP5]]
@@ -89,14 +87,11 @@ define i64 @test2(i64 %y) {
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[Y:%.*]], 0
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i64> splat (i64 77), <4 x i64> splat (i64 55)
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP1]], i64 77, i64 55
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 false)
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[TMP2]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[PREDPHI]], i64 [[TMP3]]
 ; CHECK-NEXT:    br label [[COND_END:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret i64 [[TMP4]]