[llvm] b7342e3 - [X86] Fold SHUFPS(shuffle(x),shuffle(y),mask) -> SHUFPS(x,y,mask')
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 19 12:57:56 PDT 2021
Author: Simon Pilgrim
Date: 2021-09-19T20:39:19+01:00
New Revision: b7342e3137d8fa7c356a80c1ddecf1d410c27eef
URL: https://github.com/llvm/llvm-project/commit/b7342e3137d8fa7c356a80c1ddecf1d410c27eef
DIFF: https://github.com/llvm/llvm-project/commit/b7342e3137d8fa7c356a80c1ddecf1d410c27eef.diff
LOG: [X86] Fold SHUFPS(shuffle(x),shuffle(y),mask) -> SHUFPS(x,y,mask')
We can combine unary shuffles into either of SHUFPS's inputs and adjust the shuffle mask accordingly.
Unlike general shuffle combining, we can be more aggressive and handle multi-use cases, because folding into the existing SHUFPS never creates additional shuffles.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/horizontal-sum.ll
llvm/test/CodeGen/X86/vselect.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 81b011c27d0a6..3187c41a4ab1e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38804,6 +38804,41 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
return SDValue();
}
+ case X86ISD::SHUFP: {
+ // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
+ // This is a more relaxed shuffle combiner that can ignore oneuse limits.
+ // TODO: Support types other than v4f32.
+ if (VT == MVT::v4f32) {
+ bool Updated = false;
+ SmallVector<int> Mask;
+ SmallVector<SDValue> Ops;
+ if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
+ Ops.size() == 2) {
+ for (int i = 0; i != 2; ++i) {
+ SmallVector<SDValue> SubOps;
+ SmallVector<int> SubMask, SubScaledMask;
+ SDValue Sub = peekThroughBitcasts(Ops[i]);
+ // TODO: Scaling might be easier if we specify the demanded elts.
+ if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
+ scaleShuffleElements(SubMask, 4, SubScaledMask) &&
+ SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
+ int Ofs = i * 2;
+ Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
+ Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
+ Ops[i] = DAG.getBitcast(VT, SubOps[0]);
+ Updated = true;
+ }
+ }
+ }
+ if (Updated) {
+ for (int &M : Mask)
+ M %= 4;
+ Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
+ }
+ }
+ return SDValue();
+ }
case X86ISD::VPERMI: {
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 88691eedcab56..ad129547c3c4f 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -56,7 +56,7 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 0a8283ffe3d2d..adeeb874c7bb9 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -568,13 +568,12 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
-; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: simplify_select:
More information about the llvm-commits mailing list