[llvm] [VectorCombine] foldShuffleOfBinops - if both operands are the same don't duplicate the total new cost (PR #172719)

Wed Dec 17 11:08:13 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Simon Pilgrim (RKSimon)

<details>
<summary>Changes</summary>

If we're shuffling/concatenating the same operands then ensure we don't duplicate the total cost, ensure we reuse the final shuffle and recognise that we reduce the total instruction count (so fold even when NewCost == OldCost, not just NewCost < OldCost).

---
Full diff: https://github.com/llvm/llvm-project/pull/172719.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+9-6) 
- (modified) llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll (+6-14) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 695e97dec24aa..9239cb1b989b2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2491,14 +2491,16 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
   ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
   ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
   ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
+  bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
+  ReducedInstCount |= SingleSrcBinOp;
 
   auto *ShuffleCmpTy =
       FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
-  InstructionCost NewCost =
-      TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
-                         nullptr, {X, Z}) +
-      TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
-                         nullptr, {Y, W});
+  InstructionCost NewCost = TTI.getShuffleCost(
+      SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
+  if (!SingleSrcBinOp)
+    NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
+                                  CostKind, 0, nullptr, {Y, W});
 
   if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
     NewCost +=
@@ -2520,7 +2522,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
     return false;
 
   Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
-  Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
+  Value *Shuf1 =
+      SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
   Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
                      ? Builder.CreateBinOp(
                            cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll
index 8c66684dfe137..40f55effcc881 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll
@@ -59,22 +59,14 @@ define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x
   ret <4 x float> %r
 }
 
-; FIXME: For repeated ops, don't repeat the shuffle cost.
+; For repeated ops, don't repeat the shuffle cost.
 
 define <8 x float> @shuf_fmul_v4f32_self_mul(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: define <8 x float> @shuf_fmul_v4f32_self_mul(
-; SSE-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[R:%.*]] = fmul <8 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <8 x float> [[R]]
-;
-; AVX-LABEL: define <8 x float> @shuf_fmul_v4f32_self_mul(
-; AVX-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
-; AVX-NEXT:    [[M0:%.*]] = fmul <4 x float> [[A0]], [[A0]]
-; AVX-NEXT:    [[M1:%.*]] = fmul <4 x float> [[A1]], [[A1]]
-; AVX-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[M0]], <4 x float> [[M1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    ret <8 x float> [[R]]
+; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_self_mul(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fmul <8 x float> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
 ;
   %m0 = fmul <4 x float> %a0, %a0
   %m1 = fmul <4 x float> %a1, %a1

``````````

</details>


https://github.com/llvm/llvm-project/pull/172719