[llvm] [VectorCombine] foldPermuteOfBinops - fold "shuffle (binop (shuffle, other)), undef" --> "binop (shuffle), (shuffle)". (PR #122118)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 8 06:44:42 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-vectorizers
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
foldPermuteOfBinops currently requires both binop operands to be one-use shuffles in order to fold the shuffles across the binop, but there are cases where it's still profitable to fold across the binop when only one operand is a foldable shuffle.
---
Full diff: https://github.com/llvm/llvm-project/pull/122118.diff
4 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+25-19)
- (modified) llvm/test/Transforms/PhaseOrdering/X86/hadd.ll (+7-7)
- (modified) llvm/test/Transforms/PhaseOrdering/X86/hsub.ll (+7-7)
- (modified) llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll (+18-12)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 120eafae8c5ac5..e9478103f36288 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1592,17 +1592,21 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
return false;
- Value *Op00, *Op01;
- ArrayRef<int> Mask0;
- if (!match(BinOp->getOperand(0),
- m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)))))
+ Value *Op00, *Op01, *Op10, *Op11;
+ ArrayRef<int> Mask0, Mask1;
+ bool Match0 =
+ match(BinOp->getOperand(0),
+ m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
+ bool Match1 =
+ match(BinOp->getOperand(1),
+ m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
+ if (!Match0 && !Match1)
return false;
- Value *Op10, *Op11;
- ArrayRef<int> Mask1;
- if (!match(BinOp->getOperand(1),
- m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)))))
- return false;
+ Op00 = Match0 ? Op00 : BinOp->getOperand(0);
+ Op01 = Match0 ? Op01 : BinOp->getOperand(0);
+ Op10 = Match1 ? Op10 : BinOp->getOperand(1);
+ Op11 = Match1 ? Op11 : BinOp->getOperand(1);
Instruction::BinaryOps Opcode = BinOp->getOpcode();
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
@@ -1620,15 +1624,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
return false;
- // Merge outer / inner shuffles.
+ // Merge outer / inner (or identity if no match) shuffles.
SmallVector<int> NewMask0, NewMask1;
for (int M : OuterMask) {
if (M < 0 || M >= (int)NumSrcElts) {
NewMask0.push_back(PoisonMaskElem);
NewMask1.push_back(PoisonMaskElem);
} else {
- NewMask0.push_back(Mask0[M]);
- NewMask1.push_back(Mask1[M]);
+ NewMask0.push_back(Match0 ? Mask0[M] : M);
+ NewMask1.push_back(Match1 ? Mask1[M] : M);
}
}
@@ -1636,13 +1640,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy,
- OuterMask, CostKind, 0, nullptr, {BinOp}, &I) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, Mask0,
- CostKind, 0, nullptr, {Op00, Op01},
- cast<Instruction>(BinOp->getOperand(0))) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, Mask1,
- CostKind, 0, nullptr, {Op10, Op11},
- cast<Instruction>(BinOp->getOperand(1)));
+ OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
+ if (Match0)
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty,
+ Mask0, CostKind, 0, nullptr, {Op00, Op01},
+ cast<Instruction>(BinOp->getOperand(0)));
+ if (Match1)
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty,
+ Mask1, CostKind, 0, nullptr, {Op10, Op11},
+ cast<Instruction>(BinOp->getOperand(1)));
InstructionCost NewCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, NewMask0,
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index a4aea02a335117..4863ea91803ad5 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -801,10 +801,10 @@ define <2 x double> @add_v2f64_01(<2 x double> %a, <2 x double> %b) {
define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @add_v2f64_u1(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[B]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
+; CHECK-NEXT: [[RESULT1:%.*]] = fadd <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@@ -820,9 +820,9 @@ define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
define <2 x double> @add_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @add_v2f64_0u(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = fadd <2 x double> [[TMP1]], [[SHIFT]]
; CHECK-NEXT: ret <2 x double> [[RESULT]]
;
%a0 = extractelement <2 x double> %a, i32 0
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index bcb316a4a73ea6..4f67ee0cb18c04 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -801,10 +801,10 @@ define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_u1(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[B]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
+; CHECK-NEXT: [[RESULT1:%.*]] = fsub <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@@ -820,9 +820,9 @@ define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_0u(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = fsub <2 x double> [[TMP1]], [[SHIFT]]
; CHECK-NEXT: ret <2 x double> [[RESULT]]
;
%a0 = extractelement <2 x double> %a, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index 8db1990dcbb5d8..1dc324bbd63ff9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -70,16 +70,25 @@ define <4 x double> @fadd_v4f64_multiuse_op(<4 x double> %a, <4 x double> %b) {
ret <4 x double> %post
}
-; Negative test - multiple use of inner shuffle
+; Negative test - multiple use of inner shuffle (only fold if the moved shuffle is cheaper).
define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
-; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT: [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
-; CHECK-NEXT: [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: call void @use_v4f64(<4 x double> [[A1]])
-; CHECK-NEXT: ret <4 x double> [[POST]]
+; SSE-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; SSE-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; SSE-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; SSE-NEXT: ret <4 x double> [[POST]]
+;
+; AVX-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; AVX-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT: [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; AVX-NEXT: [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
+; AVX-NEXT: [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; AVX-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; AVX-NEXT: ret <4 x double> [[POST]]
;
%a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -137,6 +146,3 @@ define <4 x i32> @sdiv_v4i32_poison_idx(<4 x i32> %a, <4 x i32> %b) {
%post = shufflevector <4 x i32> %op, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 4>
ret <4 x i32> %post
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
``````````
</details>
https://github.com/llvm/llvm-project/pull/122118
More information about the llvm-commits mailing list