[llvm] [VectorCombine] foldShuffleOfBinops - if both operands are the same, don't duplicate the total new cost (PR #172719)

Wed Dec 17 11:07:39 PST 2025

https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/172719

If both operands of the new binop would be the same shuffle/concatenation (same sources and same mask), then only count that shuffle's cost once, reuse the single new shuffle for both operands, and recognise that the fold reduces the total instruction count (so fold even when NewCost == OldCost, not just when NewCost < OldCost).

>From 1b9f6609f0a118b46dec2f19886bef7cb8b4439c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 17 Dec 2025 19:06:24 +0000
Subject: [PATCH] [VectorCombine] foldShuffleOfBinops - if both operands are
 the same, don't duplicate the total new cost

If both operands of the new binop would be the same shuffle/concatenation (same sources and same mask), then only count that shuffle's cost once, reuse the single new shuffle for both operands, and recognise that the fold reduces the total instruction count (so fold even when NewCost == OldCost, not just when NewCost < OldCost).
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 15 ++++++++------
 .../VectorCombine/X86/shuffle-of-binops.ll    | 20 ++++++-------------
 2 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 695e97dec24aa..9239cb1b989b2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2491,14 +2491,16 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
   ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
   ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
   ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
+  bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
+  ReducedInstCount |= SingleSrcBinOp;
 
   auto *ShuffleCmpTy =
       FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
-  InstructionCost NewCost =
-      TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
-                         nullptr, {X, Z}) +
-      TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
-                         nullptr, {Y, W});
+  InstructionCost NewCost = TTI.getShuffleCost(
+      SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
+  if (!SingleSrcBinOp)
+    NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
+                                  CostKind, 0, nullptr, {Y, W});
 
   if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
     NewCost +=
@@ -2520,7 +2522,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
     return false;
 
   Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
-  Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
+  Value *Shuf1 =
+      SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
   Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
                      ? Builder.CreateBinOp(
                            cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll
index 8c66684dfe137..40f55effcc881 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll
@@ -59,22 +59,14 @@ define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x
   ret <4 x float> %r
 }
 
-; FIXME: For repeated ops, don't repeat the shuffle cost.
+; For repeated ops, don't repeat the shuffle cost.
 
 define <8 x float> @shuf_fmul_v4f32_self_mul(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: define <8 x float> @shuf_fmul_v4f32_self_mul(
-; SSE-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[R:%.*]] = fmul <8 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <8 x float> [[R]]
-;
-; AVX-LABEL: define <8 x float> @shuf_fmul_v4f32_self_mul(
-; AVX-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
-; AVX-NEXT:    [[M0:%.*]] = fmul <4 x float> [[A0]], [[A0]]
-; AVX-NEXT:    [[M1:%.*]] = fmul <4 x float> [[A1]], [[A1]]
-; AVX-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[M0]], <4 x float> [[M1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    ret <8 x float> [[R]]
+; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_self_mul(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fmul <8 x float> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
 ;
   %m0 = fmul <4 x float> %a0, %a0
   %m1 = fmul <4 x float> %a1, %a1