[llvm] 66d22b4 - [VectorCombine] fold shuffle-of-binops with common operand
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 21 09:38:02 PDT 2021
Author: Sanjay Patel
Date: 2021-10-21T12:37:54-04:00
New Revision: 66d22b4da4afe00c695d9714687aac8b9e4b7396
URL: https://github.com/llvm/llvm-project/commit/66d22b4da4afe00c695d9714687aac8b9e4b7396
DIFF: https://github.com/llvm/llvm-project/commit/66d22b4da4afe00c695d9714687aac8b9e4b7396.diff
LOG: [VectorCombine] fold shuffle-of-binops with common operand
shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
This is motivated by an example in D111800
(although that patch avoids the problem for that particular example).
The pattern is shown in reduced form with:
https://llvm.org/PR52178
https://alive2.llvm.org/ce/z/d8zB4D
There is no difference on the PhaseOrdering test from D111800
because the aarch64 cost model says that the shuffle cost is 3 while
the fadd cost is 2.
Differential Revision: https://reviews.llvm.org/D111901
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/X86/shuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index bb2528e5b04d6..57b11e9414bac 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -102,6 +102,7 @@ class VectorCombine {
bool foldExtractedCmps(Instruction &I);
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
+ bool foldShuffleOfBinops(Instruction &I);
void replaceValue(Value &Old, Value &New) {
Old.replaceAllUsesWith(&New);
@@ -1065,6 +1066,60 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
return true;
}
+/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
+/// "binop (shuffle), (shuffle)".
+bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
+ auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+ if (!VecTy)
+ return false;
+
+ BinaryOperator *B0, *B1;
+ ArrayRef<int> Mask;
+ if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
+ m_Mask(Mask))) ||
+ B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy)
+ return false;
+
+ // Try to replace a binop with a shuffle if the shuffle is not costly.
+ // The new shuffle will choose from a single, common operand, so it may be
+ // cheaper than the existing two-operand shuffle.
+ SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size());
+ Instruction::BinaryOps Opcode = B0->getOpcode();
+ InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ InstructionCost ShufCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask);
+ if (ShufCost > BinopCost)
+ return false;
+
+ // If we have something like "add X, Y" and "add Z, X", swap ops to match.
+ Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
+ Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
+ if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W)
+ std::swap(X, Y);
+
+ Value *Shuf0, *Shuf1;
+ if (X == Z) {
+ // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
+ Shuf0 = Builder.CreateShuffleVector(X, UnaryMask);
+ Shuf1 = Builder.CreateShuffleVector(Y, W, Mask);
+ } else if (Y == W) {
+ // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y)
+ Shuf0 = Builder.CreateShuffleVector(X, Z, Mask);
+ Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask);
+ } else {
+ return false;
+ }
+
+ Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+ // Intersect flags from the old binops.
+ if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
+ NewInst->copyIRFlags(B0);
+ NewInst->andIRFlags(B1);
+ }
+ replaceValue(I, *NewBO);
+ return true;
+}
+
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -1083,6 +1138,7 @@ bool VectorCombine::run() {
MadeChange |= foldExtractExtract(I);
MadeChange |= foldBitcastShuf(I);
MadeChange |= foldExtractedCmps(I);
+ MadeChange |= foldShuffleOfBinops(I);
}
MadeChange |= scalarizeBinopOrCmp(I);
MadeChange |= scalarizeLoadExtract(I);
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
index 14027a34f30bb..5d4dc6ae5d013 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
@@ -151,11 +151,13 @@ define <2 x i64> @PR35454_2(<2 x i64> %v) {
ret <2 x i64> %bc3
}
+; Shuffle is much cheaper than fdiv. FMF are intersected.
+
define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; CHECK-LABEL: @shuf_fdiv_v4f32_yy(
-; CHECK-NEXT: [[B0:%.*]] = fdiv fast <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[B1:%.*]] = fdiv arcp <4 x float> [[Z:%.*]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> [[Z:%.*]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
+; CHECK-NEXT: [[R:%.*]] = fdiv arcp <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x float> [[R]]
;
%b0 = fdiv fast <4 x float> %x, %y
@@ -164,11 +166,13 @@ define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x floa
ret <4 x float> %r
}
+; Common operand is op0 of the binops.
+
define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: @shuf_add_v4i32_xx(
-; CHECK-NEXT: [[B0:%.*]] = add <4 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[B1:%.*]] = add <4 x i32> [[X]], [[Z:%.*]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 undef, i32 undef, i32 2, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
+; CHECK-NEXT: [[R:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%b0 = add <4 x i32> %x, %y
@@ -177,11 +181,13 @@ define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
ret <4 x i32> %r
}
+; For commutative instructions, common operand may be swapped.
+
define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; CHECK-LABEL: @shuf_fmul_v4f32_xx_swap(
-; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z:%.*]], [[X]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x float> [[R]]
;
%b0 = fmul <4 x float> %x, %y
@@ -190,11 +196,13 @@ define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x
ret <4 x float> %r
}
+; For commutative instructions, common operand may be swapped.
+
define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
; CHECK-LABEL: @shuf_and_v2i64_yy_swap(
-; CHECK-NEXT: [[B0:%.*]] = and <2 x i64> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[B1:%.*]] = and <2 x i64> [[Y]], [[Z:%.*]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> <i32 3, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X:%.*]], <2 x i64> [[Z:%.*]], <2 x i32> <i32 3, i32 0>
+; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <2 x i64> [[R]]
;
%b0 = and <2 x i64> %x, %y
@@ -203,11 +211,13 @@ define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
ret <2 x i64> %r
}
+; non-commutative binop, but common op0
+
define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: @shuf_shl_v4i32_xx(
-; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[X]], [[Z:%.*]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; CHECK-NEXT: [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%b0 = shl <4 x i32> %x, %y
@@ -216,6 +226,8 @@ define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
ret <4 x i32> %r
}
+; negative test - common operand, but not commutable
+
define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: @shuf_shl_v4i32_xx_swap(
; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]]
@@ -229,6 +241,8 @@ define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
ret <4 x i32> %r
}
+; negative test - mismatched opcodes
+
define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
; CHECK-LABEL: @shuf_sub_add_v2i64_yy(
; CHECK-NEXT: [[B0:%.*]] = sub <2 x i64> [[X:%.*]], [[Y:%.*]]
@@ -242,6 +256,8 @@ define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z
ret <2 x i64> %r
}
+; negative test - type change via shuffle
+
define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; CHECK-LABEL: @shuf_fmul_v4f32_xx_type(
; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X:%.*]], [[Y:%.*]]
@@ -255,6 +271,8 @@ define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x
ret <8 x float> %r
}
+; negative test - uses
+
define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: @shuf_lshr_v4i32_yy_use1(
; CHECK-NEXT: [[B0:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]]
@@ -270,6 +288,8 @@ define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32>
ret <4 x i32> %r
}
+; negative test - uses
+
define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: @shuf_mul_v4i32_yy_use2(
; CHECK-NEXT: [[B0:%.*]] = mul <4 x i32> [[X:%.*]], [[Y:%.*]]
@@ -285,6 +305,8 @@ define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
ret <4 x i32> %r
}
+; negative test - must have matching operand
+
define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {
; CHECK-LABEL: @shuf_fadd_v4f32_no_common_op(
; CHECK-NEXT: [[B0:%.*]] = fadd <4 x float> [[X:%.*]], [[Y:%.*]]
@@ -298,6 +320,8 @@ define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y,
ret <4 x float> %r
}
+; negative test - binops may be relatively cheap
+
define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf(<16 x i16> %x, <16 x i16> %y, <16 x i16> %z) {
; CHECK-LABEL: @shuf_and_v16i16_yy_expensive_shuf(
; CHECK-NEXT: [[B0:%.*]] = and <16 x i16> [[X:%.*]], [[Y:%.*]]
More information about the llvm-commits
mailing list