[llvm] [VectorCombine] foldPermuteOfBinops - support multi-use binary ops and operands in shuffle folding (PR #173153)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 22 09:42:32 PST 2025


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/173153

>From 118be695ca4e852be9da0d02269c8bf2b6839ff0 Mon Sep 17 00:00:00 2001
From: Milos Poletanovic <mpoletanovic at syrmia.com>
Date: Sat, 20 Dec 2025 15:39:09 +0100
Subject: [PATCH 1/6] [VectorCombine] Support multi-use binary ops and operands
 in shuffle folding

This patch extends VectorCombine::foldPermuteOfBinops so that it can fold
a shuffle of a binary operator whose operands are themselves shuffles even
when the binary operator or its shuffle operands have multiple uses.
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 33 +++++++++------
 .../X86/shuffle-of-binops-mutliuses.ll        | 42 +++++++++++++++++++
 2 files changed, 62 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops-mutliuses.ll

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 243f685cf25e2..c90f75e6247bf 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2280,7 +2280,7 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
   BinaryOperator *BinOp;
   ArrayRef<int> OuterMask;
   if (!match(&I,
-             m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
+             m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
     return false;
 
   // Don't introduce poison into div/rem.
@@ -2291,10 +2291,10 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
   ArrayRef<int> Mask0, Mask1;
   bool Match0 =
       match(BinOp->getOperand(0),
-            m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
+            m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
   bool Match1 =
       match(BinOp->getOperand(1),
-            m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
+            m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
   if (!Match0 && !Match1)
     return false;
 
@@ -2338,20 +2338,27 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
   bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
       all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
       ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
-
+  
+  bool WillRemoveBinOp = BinOp->hasOneUse();
   // Try to merge shuffles across the binop if the new shuffles are not costly.
   InstructionCost OldCost =
-      TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
                          BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
-  if (Match0)
-    OldCost += TTI.getShuffleCost(
-        TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
-        0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
-  if (Match1)
-    OldCost += TTI.getShuffleCost(
-        TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
-        0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
+  if (WillRemoveBinOp) {
+    OldCost += TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
+    bool WillRemoveShuf0 = BinOp->getOperand(0)->hasOneUse();
+    bool WillRemoveShuf1 = BinOp->getOperand(1)->hasOneUse();
+    if (Match0 && WillRemoveShuf0)
+        OldCost +=
+            TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
+                                Op0Ty, Mask0, CostKind, 0, nullptr, {Op00, Op01},
+                                cast<Instruction>(BinOp->getOperand(0)));
+    if (Match1 && WillRemoveShuf1)
+        OldCost +=
+            TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
+                                Op1Ty, Mask1, CostKind, 0, nullptr, {Op10, Op11},
+                                cast<Instruction>(BinOp->getOperand(1)));
+  }
 
   InstructionCost NewCost =
       TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops-mutliuses.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops-mutliuses.ll
new file mode 100644
index 0000000000000..ade3b88101ef2
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops-mutliuses.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=vector-combine -S -mtriple=x86_64-- -mcpu=haswell < %s | FileCheck %s
+
+declare void @use_v32i8(<32 x i8>)
+
+define <32 x i8> @max_expense_multi_use(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: define <32 x i8> @max_expense_multi_use(
+; CHECK-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[A1:%.*]] = shufflevector <32 x i8> [[A]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[B1:%.*]] = shufflevector <32 x i8> [[B]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[OP:%.*]] = add <32 x i8> [[A1]], [[B1]]
+; CHECK-NEXT:    call void @use_v32i8(<32 x i8> [[OP]])
+; CHECK-NEXT:    [[POST:%.*]] = add <32 x i8> [[A]], [[B]]
+; CHECK-NEXT:    ret <32 x i8> [[POST]]
+;
+
+  %a1 = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %b1 = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %op = add <32 x i8> %a1, %b1
+  call void @use_v32i8(<32 x i8> %op)
+  %post = shufflevector <32 x i8> %op, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <32 x i8> %post
+}
+
+declare void @use_v4f64(<4 x double>)
+
+define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[NEW_B_SHUF:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[POST:%.*]] = fadd <4 x double> [[A]], [[NEW_B_SHUF]]
+; CHECK-NEXT:    call void @use_v4f64(<4 x double> [[A1]])
+; CHECK-NEXT:    ret <4 x double> [[POST]]
+;
+  %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+  %op = fadd <4 x double> %a1, %b1
+  %post = shufflevector <4 x double> %op, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  call void @use_v4f64(<4 x double> %a1)
+  ret <4 x double> %post
+}

>From f16ee27cbc3cfcdcb0334593cae06976f0f58e5f Mon Sep 17 00:00:00 2001
From: Milos Poletanovic <mpoletanovic at syrmia.com>
Date: Sat, 20 Dec 2025 17:05:05 +0100
Subject: [PATCH 2/6] Format code.

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 29 +++++++++----------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c90f75e6247bf..f10868d043b22 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2279,8 +2279,7 @@ bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
 bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
   BinaryOperator *BinOp;
   ArrayRef<int> OuterMask;
-  if (!match(&I,
-             m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
+  if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
     return false;
 
   // Don't introduce poison into div/rem.
@@ -2289,12 +2288,10 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
 
   Value *Op00, *Op01, *Op10, *Op11;
   ArrayRef<int> Mask0, Mask1;
-  bool Match0 =
-      match(BinOp->getOperand(0),
-            m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
-  bool Match1 =
-      match(BinOp->getOperand(1),
-            m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
+  bool Match0 = match(BinOp->getOperand(0),
+                      m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
+  bool Match1 = match(BinOp->getOperand(1),
+                      m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
   if (!Match0 && !Match1)
     return false;
 
@@ -2349,15 +2346,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
     bool WillRemoveShuf0 = BinOp->getOperand(0)->hasOneUse();
     bool WillRemoveShuf1 = BinOp->getOperand(1)->hasOneUse();
     if (Match0 && WillRemoveShuf0)
-        OldCost +=
-            TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
-                                Op0Ty, Mask0, CostKind, 0, nullptr, {Op00, Op01},
-                                cast<Instruction>(BinOp->getOperand(0)));
+      OldCost +=
+          TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
+                             Op0Ty, Mask0, CostKind, 0, nullptr, {Op00, Op01},
+                             cast<Instruction>(BinOp->getOperand(0)));
     if (Match1 && WillRemoveShuf1)
-        OldCost +=
-            TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
-                                Op1Ty, Mask1, CostKind, 0, nullptr, {Op10, Op11},
-                                cast<Instruction>(BinOp->getOperand(1)));
+      OldCost +=
+          TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
+                             Op1Ty, Mask1, CostKind, 0, nullptr, {Op10, Op11},
+                             cast<Instruction>(BinOp->getOperand(1)));
   }
 
   InstructionCost NewCost =

>From 7f81bf1ad44f771cc3cbcb0f8fb207f7cf361238 Mon Sep 17 00:00:00 2001
From: Milos Poletanovic <mpoletanovic at syrmia.com>
Date: Sat, 20 Dec 2025 17:12:24 +0100
Subject: [PATCH 3/6] Format code2.

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index f10868d043b22..f5df35bf59c35 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2335,7 +2335,7 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
   bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
       all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
       ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
-  
+
   bool WillRemoveBinOp = BinOp->hasOneUse();
   // Try to merge shuffles across the binop if the new shuffles are not costly.
   InstructionCost OldCost =

>From 40a24ef7b09240df9332a5e49c2372068ce02c7c Mon Sep 17 00:00:00 2001
From: Milos Poletanovic <mpoletanovic at syrmia.com>
Date: Mon, 22 Dec 2025 12:58:15 +0100
Subject: [PATCH 4/6] Addressed comments.

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 44 +++++++++++--------
 .../VectorCombine/X86/permute-of-binops.ll    | 22 +++++++++-
 .../X86/shuffle-of-binops-mutliuses.ll        | 42 ------------------
 3 files changed, 45 insertions(+), 63 deletions(-)
 delete mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops-mutliuses.ll

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index f5df35bf59c35..a5c8fd753bf79 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2336,29 +2336,35 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
       all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
       ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
 
-  bool WillRemoveBinOp = BinOp->hasOneUse();
+  InstructionCost NewCost = 0;
   // Try to merge shuffles across the binop if the new shuffles are not costly.
+  InstructionCost BinOpCost =
+      TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
   InstructionCost OldCost =
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
-                         BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
-  if (WillRemoveBinOp) {
-    OldCost += TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
-    bool WillRemoveShuf0 = BinOp->getOperand(0)->hasOneUse();
-    bool WillRemoveShuf1 = BinOp->getOperand(1)->hasOneUse();
-    if (Match0 && WillRemoveShuf0)
-      OldCost +=
-          TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
-                             Op0Ty, Mask0, CostKind, 0, nullptr, {Op00, Op01},
-                             cast<Instruction>(BinOp->getOperand(0)));
-    if (Match1 && WillRemoveShuf1)
-      OldCost +=
-          TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
-                             Op1Ty, Mask1, CostKind, 0, nullptr, {Op10, Op11},
-                             cast<Instruction>(BinOp->getOperand(1)));
+      BinOpCost + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                     ShuffleDstTy, BinOpTy, OuterMask, CostKind,
+                                     0, nullptr, {BinOp}, &I);
+  if (!BinOp->hasOneUse())
+    NewCost += BinOpCost;
+
+  if (Match0) {
+    InstructionCost Shuf0Cost = TTI.getShuffleCost(
+        TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
+        0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
+    OldCost += Shuf0Cost;
+    if (!BinOp->getOperand(0)->hasOneUse())
+      NewCost += Shuf0Cost;
+  }
+  if (Match1) {
+    InstructionCost Shuf1Cost = TTI.getShuffleCost(
+        TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
+        0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
+    OldCost += Shuf1Cost;
+    if (!BinOp->getOperand(1)->hasOneUse())
+      NewCost += Shuf1Cost;
   }
 
-  InstructionCost NewCost =
-      TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
+  NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
 
   if (!IsIdentity0)
     NewCost +=
diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index b3de3b8f1ca62..76f89a40ef8a9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -64,14 +64,15 @@ define <4 x float> @fadd_v4f32_mixed_types(<4 x float> %a0) {
   ret <4 x float> %post
 }
 
-; Negative test - multiple use of fadd
 define <4 x double> @fadd_v4f64_multiuse_op(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_op(
 ; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
-; CHECK-NEXT:    [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    call void @use_v4f64(<4 x double> [[OP]])
 ; CHECK-NEXT:    ret <4 x double> [[POST]]
 ;
@@ -101,6 +102,23 @@ define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %
   ret <4 x double> %post
 }
 
+define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[POST:%.*]] = fadd <4 x double> [[A]], [[TMP1]]
+; CHECK-NEXT:    call void @use_v4f64(<4 x double> [[A1]])
+; CHECK-NEXT:    ret <4 x double> [[POST]]
+;
+  %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+  %op = fadd <4 x double> %a1, %b1
+  %post = shufflevector <4 x double> %op, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  call void @use_v4f64(<4 x double> %a1)
+  ret <4 x double> %post
+}
+
 define <4 x i32> @sdiv_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: define <4 x i32> @sdiv_v4i32(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops-mutliuses.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops-mutliuses.ll
deleted file mode 100644
index ade3b88101ef2..0000000000000
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops-mutliuses.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -passes=vector-combine -S -mtriple=x86_64-- -mcpu=haswell < %s | FileCheck %s
-
-declare void @use_v32i8(<32 x i8>)
-
-define <32 x i8> @max_expense_multi_use(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: define <32 x i8> @max_expense_multi_use(
-; CHECK-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[A1:%.*]] = shufflevector <32 x i8> [[A]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[B1:%.*]] = shufflevector <32 x i8> [[B]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[OP:%.*]] = add <32 x i8> [[A1]], [[B1]]
-; CHECK-NEXT:    call void @use_v32i8(<32 x i8> [[OP]])
-; CHECK-NEXT:    [[POST:%.*]] = add <32 x i8> [[A]], [[B]]
-; CHECK-NEXT:    ret <32 x i8> [[POST]]
-;
-
-  %a1 = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-  %b1 = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-  %op = add <32 x i8> %a1, %b1
-  call void @use_v32i8(<32 x i8> %op)
-  %post = shufflevector <32 x i8> %op, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-  ret <32 x i8> %post
-}
-
-declare void @use_v4f64(<4 x double>)
-
-define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(
-; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[NEW_B_SHUF:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[POST:%.*]] = fadd <4 x double> [[A]], [[NEW_B_SHUF]]
-; CHECK-NEXT:    call void @use_v4f64(<4 x double> [[A1]])
-; CHECK-NEXT:    ret <4 x double> [[POST]]
-;
-  %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-  %op = fadd <4 x double> %a1, %b1
-  %post = shufflevector <4 x double> %op, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  call void @use_v4f64(<4 x double> %a1)
-  ret <4 x double> %post
-}

>From 79eff3761381316a4c04247678bd26a5f7f00e42 Mon Sep 17 00:00:00 2001
From: Milos Poletanovic <mpoletanovic at syrmia.com>
Date: Mon, 22 Dec 2025 16:22:24 +0100
Subject: [PATCH 5/6] Fixed new cost and test.

---
 .../Transforms/Vectorize/VectorCombine.cpp    |  4 +--
 .../VectorCombine/X86/permute-of-binops.ll    | 25 ++++++++++++++++---
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a5c8fd753bf79..d9e9aa4f7364e 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2352,7 +2352,7 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
         TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
         0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
     OldCost += Shuf0Cost;
-    if (!BinOp->getOperand(0)->hasOneUse())
+    if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
       NewCost += Shuf0Cost;
   }
   if (Match1) {
@@ -2360,7 +2360,7 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
         TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
         0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
     OldCost += Shuf1Cost;
-    if (!BinOp->getOperand(1)->hasOneUse())
+    if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
       NewCost += Shuf1Cost;
   }
 
diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index 76f89a40ef8a9..c37a4952317ba 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=HASWELL
 
 ; Fold "shuffle (binop (shuffle, shuffle)), undef" --> "binop (shuffle), (shuffle)"
 
@@ -70,9 +71,7 @@ define <4 x double> @fadd_v4f64_multiuse_op(<4 x double> %a, <4 x double> %b) {
 ; CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; CHECK-NEXT:    call void @use_v4f64(<4 x double> [[OP]])
 ; CHECK-NEXT:    ret <4 x double> [[POST]]
 ;
@@ -102,6 +101,26 @@ define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %
   ret <4 x double> %post
 }
 
+declare void @use_v32i8(<32 x i8>)
+define <32 x i8> @max_expense_multi_use_triggered(<32 x i8> %a, <32 x i8> %b) {
+; HASWELL-LABEL: define <32 x i8> @max_expense_multi_use_triggered(
+; HASWELL-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; HASWELL-NEXT:    [[A1:%.*]] = shufflevector <32 x i8> [[A]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; HASWELL-NEXT:    [[B1:%.*]] = shufflevector <32 x i8> [[B]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; HASWELL-NEXT:    [[OP:%.*]] = add <32 x i8> [[A1]], [[B1]]
+; HASWELL-NEXT:    call void @use_v32i8(<32 x i8> [[OP]])
+; HASWELL-NEXT:    [[POST:%.*]] = add <32 x i8> [[A]], [[B]]
+; HASWELL-NEXT:    ret <32 x i8> [[POST]]
+;
+
+  %a1 = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %b1 = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %op = add <32 x i8> %a1, %b1
+  call void @use_v32i8(<32 x i8> %op)
+  %post = shufflevector <32 x i8> %op, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <32 x i8> %post
+}
+
 define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(
 ; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {

>From 15e27941eaffd0cf3403f9d61f58d2a9670ccfc9 Mon Sep 17 00:00:00 2001
From: Milos Poletanovic <mpoletanovic at syrmia.com>
Date: Mon, 22 Dec 2025 17:29:57 +0100
Subject: [PATCH 6/6] Changed test.

---
 .../VectorCombine/X86/permute-of-binops.ll     | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index c37a4952317ba..5373f6c07be31 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=HASWELL
 
 ; Fold "shuffle (binop (shuffle, shuffle)), undef" --> "binop (shuffle), (shuffle)"
 
@@ -103,16 +102,15 @@ define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %
 
 declare void @use_v32i8(<32 x i8>)
 define <32 x i8> @max_expense_multi_use_triggered(<32 x i8> %a, <32 x i8> %b) {
-; HASWELL-LABEL: define <32 x i8> @max_expense_multi_use_triggered(
-; HASWELL-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; HASWELL-NEXT:    [[A1:%.*]] = shufflevector <32 x i8> [[A]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; HASWELL-NEXT:    [[B1:%.*]] = shufflevector <32 x i8> [[B]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; HASWELL-NEXT:    [[OP:%.*]] = add <32 x i8> [[A1]], [[B1]]
-; HASWELL-NEXT:    call void @use_v32i8(<32 x i8> [[OP]])
-; HASWELL-NEXT:    [[POST:%.*]] = add <32 x i8> [[A]], [[B]]
-; HASWELL-NEXT:    ret <32 x i8> [[POST]]
+; CHECK-LABEL: define <32 x i8> @max_expense_multi_use_triggered(
+; CHECK-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A1:%.*]] = shufflevector <32 x i8> [[A]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[B1:%.*]] = shufflevector <32 x i8> [[B]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[OP:%.*]] = add <32 x i8> [[A1]], [[B1]]
+; CHECK-NEXT:    call void @use_v32i8(<32 x i8> [[OP]])
+; CHECK-NEXT:    [[POST:%.*]] = add <32 x i8> [[A]], [[B]]
+; CHECK-NEXT:    ret <32 x i8> [[POST]]
 ;
-
   %a1 = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   %b1 = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   %op = add <32 x i8> %a1, %b1



More information about the llvm-commits mailing list