[llvm] 66d22b4 - [VectorCombine] fold shuffle-of-binops with common operand
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 21 09:38:02 PDT 2021
Author: Sanjay Patel
Date: 2021-10-21T12:37:54-04:00
New Revision: 66d22b4da4afe00c695d9714687aac8b9e4b7396
URL: https://github.com/llvm/llvm-project/commit/66d22b4da4afe00c695d9714687aac8b9e4b7396
DIFF: https://github.com/llvm/llvm-project/commit/66d22b4da4afe00c695d9714687aac8b9e4b7396.diff
LOG: [VectorCombine] fold shuffle-of-binops with common operand
shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
This is motivated by an example in D111800
(although that patch avoids the problem for that particular example).
The pattern is shown in reduced form with:
https://llvm.org/PR52178
https://alive2.llvm.org/ce/z/d8zB4D
There is no difference on the PhaseOrdering test from D111800
because the aarch64 cost model says that the shuffle cost is 3 while
the fadd cost is 2.
Differential Revision: https://reviews.llvm.org/D111901
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/X86/shuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index bb2528e5b04d6..57b11e9414bac 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -102,6 +102,7 @@ class VectorCombine {
bool foldExtractedCmps(Instruction &I);
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
+ bool foldShuffleOfBinops(Instruction &I);
void replaceValue(Value &Old, Value &New) {
Old.replaceAllUsesWith(&New);
@@ -1065,6 +1066,60 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
return true;
}
+/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
+/// "binop (shuffle), (shuffle)".
+bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
+ auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+ if (!VecTy)
+ return false;
+
+ BinaryOperator *B0, *B1;
+ ArrayRef<int> Mask;
+ if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
+ m_Mask(Mask))) ||
+ B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy)
+ return false;
+
+ // Try to replace a binop with a shuffle if the shuffle is not costly.
+ // The new shuffle will choose from a single, common operand, so it may be
+ // cheaper than the existing two-operand shuffle.
+ SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size());
+ Instruction::BinaryOps Opcode = B0->getOpcode();
+ InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ InstructionCost ShufCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask);
+ if (ShufCost > BinopCost)
+ return false;
+
+ // If we have something like "add X, Y" and "add Z, X", swap ops to match.
+ Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
+ Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
+ if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W)
+ std::swap(X, Y);
+
+ Value *Shuf0, *Shuf1;
+ if (X == Z) {
+ // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
+ Shuf0 = Builder.CreateShuffleVector(X, UnaryMask);
+ Shuf1 = Builder.CreateShuffleVector(Y, W, Mask);
+ } else if (Y == W) {
+ // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y)
+ Shuf0 = Builder.CreateShuffleVector(X, Z, Mask);
+ Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask);
+ } else {
+ return false;
+ }
+
+ Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+ // Intersect flags from the old binops.
+ if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
+ NewInst->copyIRFlags(B0);
+ NewInst->andIRFlags(B1);
+ }
+ replaceValue(I, *NewBO);
+ return true;
+}
+
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -1083,6 +1138,7 @@ bool VectorCombine::run() {
MadeChange |= foldExtractExtract(I);
MadeChange |= foldBitcastShuf(I);
MadeChange |= foldExtractedCmps(I);
+ MadeChange |= foldShuffleOfBinops(I);
}
MadeChange |= scalarizeBinopOrCmp(I);
MadeChange |= scalarizeLoadExtract(I);
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
index 14027a34f30bb..5d4dc6ae5d013 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
@@ -151,11 +151,13 @@ define <2 x i64> @PR35454_2(<2 x i64> %v) {
ret <2 x i64> %bc3
}
+; Shuffle is much cheaper than fdiv. FMF are intersected.
+
define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; CHECK-LABEL: @shuf_fdiv_v4f32_yy(
-; CHECK-NEXT: [[B0:%.*]] = fdiv fast <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[B1:%.*]] = fdiv arcp <4 x float> [[Z:%.*]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> [[Z:%.*]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
+; CHECK-NEXT: [[R:%.*]] = fdiv arcp <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x float> [[R]]
;
%b0 = fdiv fast <4 x float> %x, %y
@@ -164,11 +166,13 @@ define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x floa
ret <4 x float> %r
}
+; Common operand is op0 of the binops.
+
define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: @shuf_add_v4i32_xx(
-; CHECK-NEXT: [[B0:%.*]] = add <4 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[B1:%.*]] = add <4 x i32> [[X]], [[Z:%.*]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 undef, i32 undef, i32 2, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
+; CHECK-NEXT: [[R:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%b0 = add <4 x i32> %x, %y
@@ -177,11 +181,13 @@ define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
ret <4 x i32> %r
}
+; For commutative instructions, common operand may be swapped.
+
define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; CHECK-LABEL: @shuf_fmul_v4f32_xx_swap(
-; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z:%.*]], [[X]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x float> [[R]]
;
%b0 = fmul <4 x float> %x, %y
@@ -190,11 +196,13 @@ define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x
ret <4 x float> %r
}
+; For commutative instructions, common operand may be swapped.
+
define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
; CHECK-LABEL: @shuf_and_v2i64_yy_swap(
-; CHECK-NEXT: [[B0:%.*]] = and <2 x i64> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[B1:%.*]] = and <2 x i64> [[Y]], [[Z:%.*]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> <i32 3, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X:%.*]], <2 x i64> [[Z:%.*]], <2 x i32> <i32 3, i32 0>
+; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <2 x i64> [[R]]
;
%b0 = and <2 x i64> %x, %y
@@ -203,11 +211,13 @@ define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
ret <2 x i64> %r
}
+; non-commutative binop, but common op0
+
define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: @shuf_shl_v4i32_xx(
-; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[X]], [[Z:%.*]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; CHECK-NEXT: [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%b0 = shl <4 x i32> %x, %y
@@ -216,6 +226,8 @@ define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
ret <4 x i32> %r
}
+; negative test - common operand, but not commutable
+
define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: @shuf_shl_v4i32_xx_swap(
; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]]
@@ -229,6 +241,8 @@ define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
ret <4 x i32> %r
}
+; negative test - mismatched opcodes
+
define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
; CHECK-LABEL: @shuf_sub_add_v2i64_yy(
; CHECK-NEXT: [[B0:%.*]] = sub <2 x i64> [[X:%.*]], [[Y:%.*]]
@@ -242,6 +256,8 @@ define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z
ret <2 x i64> %r
}
+; negative test - type change via shuffle
+
define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; CHECK-LABEL: @shuf_fmul_v4f32_xx_type(
; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X:%.*]], [[Y:%.*]]
@@ -255,6 +271,8 @@ define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x
ret <8 x float> %r
}
+; negative test - uses
+
define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: @shuf_lshr_v4i32_yy_use1(
; CHECK-NEXT: [[B0:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]]
@@ -270,6 +288,8 @@ define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32>
ret <4 x i32> %r
}
+; negative test - uses
+
define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: @shuf_mul_v4i32_yy_use2(
; CHECK-NEXT: [[B0:%.*]] = mul <4 x i32> [[X:%.*]], [[Y:%.*]]
@@ -285,6 +305,8 @@ define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
ret <4 x i32> %r
}
+; negative test - must have matching operand
+
define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {
; CHECK-LABEL: @shuf_fadd_v4f32_no_common_op(
; CHECK-NEXT: [[B0:%.*]] = fadd <4 x float> [[X:%.*]], [[Y:%.*]]
@@ -298,6 +320,8 @@ define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y,
ret <4 x float> %r
}
+; negative test - binops may be relatively cheap
+
define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf(<16 x i16> %x, <16 x i16> %y, <16 x i16> %z) {
; CHECK-LABEL: @shuf_and_v16i16_yy_expensive_shuf(
; CHECK-NEXT: [[B0:%.*]] = and <16 x i16> [[X:%.*]], [[Y:%.*]]
More information about the llvm-commits
mailing list