[llvm] [VectorCombine] foldPermuteOfBinops - fold "shuffle (binop (shuffle, other)), undef" --> "binop (shuffle), (shuffle)". (PR #122118)

Wed Jan 8 06:44:42 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-vectorizers

Author: Simon Pilgrim (RKSimon)

<details>
<summary>Changes</summary>

foldPermuteOfBinops currently requires both binop operands to be oneuse shuffles to fold the shuffles across the binop, but there will be cases where its still profitable to fold across the binop with only one foldable shuffle.

---
Full diff: https://github.com/llvm/llvm-project/pull/122118.diff


4 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+25-19) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/hadd.ll (+7-7) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/hsub.ll (+7-7) 
- (modified) llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll (+18-12) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 120eafae8c5ac5..e9478103f36288 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1592,17 +1592,21 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
   if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
     return false;
 
-  Value *Op00, *Op01;
-  ArrayRef<int> Mask0;
-  if (!match(BinOp->getOperand(0),
-             m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)))))
+  Value *Op00, *Op01, *Op10, *Op11;
+  ArrayRef<int> Mask0, Mask1;
+  bool Match0 =
+      match(BinOp->getOperand(0),
+            m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
+  bool Match1 =
+      match(BinOp->getOperand(1),
+            m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
+  if (!Match0 && !Match1)
     return false;
 
-  Value *Op10, *Op11;
-  ArrayRef<int> Mask1;
-  if (!match(BinOp->getOperand(1),
-             m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)))))
-    return false;
+  Op00 = Match0 ? Op00 : BinOp->getOperand(0);
+  Op01 = Match0 ? Op01 : BinOp->getOperand(0);
+  Op10 = Match1 ? Op10 : BinOp->getOperand(1);
+  Op11 = Match1 ? Op11 : BinOp->getOperand(1);
 
   Instruction::BinaryOps Opcode = BinOp->getOpcode();
   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
@@ -1620,15 +1624,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
       any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
     return false;
 
-  // Merge outer / inner shuffles.
+  // Merge outer / inner (or identity if no match) shuffles.
   SmallVector<int> NewMask0, NewMask1;
   for (int M : OuterMask) {
     if (M < 0 || M >= (int)NumSrcElts) {
       NewMask0.push_back(PoisonMaskElem);
       NewMask1.push_back(PoisonMaskElem);
     } else {
-      NewMask0.push_back(Mask0[M]);
-      NewMask1.push_back(Mask1[M]);
+      NewMask0.push_back(Match0 ? Mask0[M] : M);
+      NewMask1.push_back(Match1 ? Mask1[M] : M);
     }
   }
 
@@ -1636,13 +1640,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
   InstructionCost OldCost =
       TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy,
-                         OuterMask, CostKind, 0, nullptr, {BinOp}, &I) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, Mask0,
-                         CostKind, 0, nullptr, {Op00, Op01},
-                         cast<Instruction>(BinOp->getOperand(0))) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, Mask1,
-                         CostKind, 0, nullptr, {Op10, Op11},
-                         cast<Instruction>(BinOp->getOperand(1)));
+                         OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
+  if (Match0)
+    OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty,
+                                  Mask0, CostKind, 0, nullptr, {Op00, Op01},
+                                  cast<Instruction>(BinOp->getOperand(0)));
+  if (Match1)
+    OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty,
+                                  Mask1, CostKind, 0, nullptr, {Op10, Op11},
+                                  cast<Instruction>(BinOp->getOperand(1)));
 
   InstructionCost NewCost =
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, NewMask0,
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index a4aea02a335117..4863ea91803ad5 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -801,10 +801,10 @@ define <2 x double> @add_v2f64_01(<2 x double> %a, <2 x double> %b) {
 
 define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @add_v2f64_u1(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[B]], [[SHIFT]]
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    ret <2 x double> [[RESULT]]
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fadd <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[RESULT1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -820,9 +820,9 @@ define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
 
 define <2 x double> @add_v2f64_0u(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @add_v2f64_0u(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = fadd <2 x double> [[TMP1]], [[SHIFT]]
 ; CHECK-NEXT:    ret <2 x double> [[RESULT]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index bcb316a4a73ea6..4f67ee0cb18c04 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -801,10 +801,10 @@ define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
 
 define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @sub_v2f64_u1(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[B]], [[SHIFT]]
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    ret <2 x double> [[RESULT]]
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[RESULT1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -820,9 +820,9 @@ define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
 
 define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @sub_v2f64_0u(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = fsub <2 x double> [[TMP1]], [[SHIFT]]
 ; CHECK-NEXT:    ret <2 x double> [[RESULT]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index 8db1990dcbb5d8..1dc324bbd63ff9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -70,16 +70,25 @@ define <4 x double> @fadd_v4f64_multiuse_op(<4 x double> %a, <4 x double> %b) {
   ret <4 x double> %post
 }
 
-; Negative test - multiple use of inner shuffle
+; Negative test - multiple use of inner shuffle (only fold if the moved shuffle is cheaper).
 define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
-; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
-; CHECK-NEXT:    [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    call void @use_v4f64(<4 x double> [[A1]])
-; CHECK-NEXT:    ret <4 x double> [[POST]]
+; SSE-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; SSE-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; SSE-NEXT:    [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    call void @use_v4f64(<4 x double> [[A1]])
+; SSE-NEXT:    ret <4 x double> [[POST]]
+;
+; AVX-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; AVX-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; AVX-NEXT:    [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
+; AVX-NEXT:    [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; AVX-NEXT:    call void @use_v4f64(<4 x double> [[A1]])
+; AVX-NEXT:    ret <4 x double> [[POST]]
 ;
   %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -137,6 +146,3 @@ define <4 x i32> @sdiv_v4i32_poison_idx(<4 x i32> %a, <4 x i32> %b) {
   %post = shufflevector <4 x i32> %op, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 4>
   ret <4 x i32> %post
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}

``````````

</details>


https://github.com/llvm/llvm-project/pull/122118