[llvm] 1a81b29 - [X86][SSE] combineCommutableSHUFP - permilps(shufps(load(), x)) --> permilps(shufps(x, load()))
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 26 06:36:55 PST 2020
Author: Simon Pilgrim
Date: 2020-01-26T14:36:23Z
New Revision: 1a81b296cda53753894f8a8f38b7b89e05806307
URL: https://github.com/llvm/llvm-project/commit/1a81b296cda53753894f8a8f38b7b89e05806307
DIFF: https://github.com/llvm/llvm-project/commit/1a81b296cda53753894f8a8f38b7b89e05806307.diff
LOG: [X86][SSE] combineCommutableSHUFP - permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
Pull out the combineTargetShuffle code added in rG3fd5d1c6e7db into a helper function and extend it to handle the shufps(shufps(load(),x),y) and shufps(y,shufps(load(),x)) cases as well.
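The transform only rewrites immediates; no lanes actually move. SHUFPS takes its low two result lanes from the first operand and its high two from the second, so commuting the operands (after swapping the two index nibbles of the immediate) swaps the 64-bit halves of the result, and any consumer's 2-bit lane indices must each flip bit 1 to compensate. The following standalone scalar model (not LLVM code, just a sanity check of the immediate arithmetic) brute-forces every immediate pair to confirm the VPERMILPI fixup of XOR 0xAA; the 0x0A and 0xA0 fixups used below for outer SHUFPS nodes follow the same reasoning applied to only the low or high index fields.

#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<float, 4>;

// Scalar model of SHUFPS: the low two result lanes select from the first
// operand, the high two from the second, via 2-bit fields of the immediate.
static V4 shufps(V4 a, V4 b, uint8_t imm) {
  return {a[imm & 3], a[(imm >> 2) & 3], b[(imm >> 4) & 3], b[(imm >> 6) & 3]};
}

// Scalar model of VPERMILPS with an immediate: all four lanes select from
// the single source operand.
static V4 permilps(V4 a, uint8_t imm) {
  return {a[imm & 3], a[(imm >> 2) & 3], a[(imm >> 4) & 3], a[(imm >> 6) & 3]};
}

int main() {
  V4 load = {0, 1, 2, 3}; // stands in for the foldable load() operand
  V4 x = {4, 5, 6, 7};
  for (unsigned Shuf = 0; Shuf < 256; ++Shuf) {
    // Commuting SHUFPS swaps the two 4-bit halves of its immediate...
    unsigned Swapped = ((Shuf & 0x0F) << 4) | ((Shuf & 0xF0) >> 4);
    for (unsigned Perm = 0; Perm < 256; ++Perm) {
      V4 Ref = permilps(shufps(load, x, Shuf), Perm);
      // ...and swaps the 64-bit halves of its result, so each 2-bit lane
      // index of the consuming permilps flips bit 1: XOR with 0xAA.
      V4 New = permilps(shufps(x, load, Swapped), Perm ^ 0xAA);
      assert(Ref == New);
    }
  }
}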
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/vec_insert-5.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 63e5ba859b5b..38f0f52a0440 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34551,6 +34551,59 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
return V;
}
+// Attempt to commute shufps LHS loads:
+// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
+static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
+ SelectionDAG &DAG) {
+ // TODO: Add general vXf32 + vXf64 support.
+ if (VT != MVT::v4f32)
+ return SDValue();
+
+ // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
+ auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
+ if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
+ return SDValue();
+ SDValue N0 = V.getOperand(0);
+ SDValue N1 = V.getOperand(1);
+ unsigned Imm = V.getConstantOperandVal(2);
+ if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
+ MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+ return SDValue();
+ Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
+ DAG.getTargetConstant(Imm, DL, MVT::i8));
+ };
+
+ switch (N.getOpcode()) {
+ case X86ISD::VPERMILPI:
+ if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
+ unsigned Imm = N.getConstantOperandVal(1);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ }
+ break;
+ case X86ISD::SHUFP: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (N0 == N1) {
+ if (SDValue NewSHUFP = commuteSHUFP(N, N0))
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
+ DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
+ } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
+ }
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -34588,27 +34641,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
}
- // Attempt to commute shufps LHS loads:
- // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
- if (VT == MVT::v4f32 &&
- (X86ISD::VPERMILPI == Opcode ||
- (X86ISD::SHUFP == Opcode && N.getOperand(0) == N.getOperand(1)))) {
- SDValue N0 = N.getOperand(0);
- unsigned Imm = N.getConstantOperandVal(X86ISD::VPERMILPI == Opcode ? 1 : 2);
- if (N0.getOpcode() == X86ISD::SHUFP && N->isOnlyUserOf(N0.getNode())) {
- SDValue N00 = N0.getOperand(0);
- SDValue N01 = N0.getOperand(1);
- if (MayFoldLoad(peekThroughOneUseBitcasts(N00)) &&
- !MayFoldLoad(peekThroughOneUseBitcasts(N01))) {
- unsigned Imm1 = N0.getConstantOperandVal(2);
- Imm1 = ((Imm1 & 0x0F) << 4) | ((Imm1 & 0xF0) >> 4);
- SDValue NewN0 = DAG.getNode(X86ISD::SHUFP, DL, VT, N01, N00,
- DAG.getTargetConstant(Imm1, DL, MVT::i8));
- return DAG.getNode(X86ISD::SHUFP, DL, VT, NewN0, NewN0,
- DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
- }
- }
- }
+ if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
+ return R;
switch (Opcode) {
case X86ISD::VBROADCAST: {
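For the two new outer-SHUFP cases the commuted inner node feeds only one operand of the outer shuffle, so only that operand's index fields need the lane flip. A standalone scalar check of those fixups (again a model of the immediate arithmetic, not code from the patch):

#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<float, 4>;

// Scalar model of SHUFPS: low two lanes select from the first operand,
// high two from the second.
static V4 shufps(V4 a, V4 b, uint8_t imm) {
  return {a[imm & 3], a[(imm >> 2) & 3], b[(imm >> 4) & 3], b[(imm >> 6) & 3]};
}

int main() {
  V4 load = {0, 1, 2, 3}, x = {4, 5, 6, 7}, y = {8, 9, 10, 11};
  for (unsigned Inner = 0; Inner < 256; ++Inner) {
    unsigned Swapped = ((Inner & 0x0F) << 4) | ((Inner & 0xF0) >> 4);
    V4 Old = shufps(load, x, Inner);   // load on the LHS: cannot fold
    V4 New = shufps(x, load, Swapped); // commuted: result halves swap
    for (unsigned Outer = 0; Outer < 256; ++Outer) {
      // Inner node feeds the LHS of the outer shufps: only the low two
      // index fields read it, so flip bit 1 of each (XOR 0x0A).
      assert(shufps(Old, y, Outer) == shufps(New, y, Outer ^ 0x0A));
      // Inner node feeds the RHS: the high two fields instead (XOR 0xA0).
      assert(shufps(y, Old, Outer) == shufps(y, New, Outer ^ 0xA0));
    }
  }
}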
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 8460e8666bf4..20db28ca3b2a 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1198,44 +1198,42 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqu 64(%rdi), %xmm10
; SSE2-NEXT: movups 80(%rdi), %xmm8
+; SSE2-NEXT: movups 64(%rdi), %xmm4
; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu 16(%rdi), %xmm11
-; SSE2-NEXT: movups 32(%rdi), %xmm5
-; SSE2-NEXT: movdqu 48(%rdi), %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; SSE2-NEXT: movaps %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[2,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[2,0]
-; SSE2-NEXT: movaps %xmm8, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[2,0]
-; SSE2-NEXT: movdqa %xmm9, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm8[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
-; SSE2-NEXT: movaps %xmm2, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm10[3,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm11[3,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0]
-; SSE2-NEXT: movups %xmm9, 16(%rsi)
-; SSE2-NEXT: movups %xmm3, (%rsi)
-; SSE2-NEXT: movups %xmm2, 16(%rdx)
+; SSE2-NEXT: movups 16(%rdi), %xmm6
+; SSE2-NEXT: movups 32(%rdi), %xmm10
+; SSE2-NEXT: movups 48(%rdi), %xmm12
+; SSE2-NEXT: movdqa %xmm0, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm6[0,2]
+; SSE2-NEXT: movaps %xmm12, %xmm6
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[0,0]
+; SSE2-NEXT: movaps %xmm6, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[1,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,0,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm4[0,2]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm8[2,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,0]
+; SSE2-NEXT: movups %xmm12, 16(%rsi)
+; SSE2-NEXT: movups %xmm11, (%rsi)
+; SSE2-NEXT: movups %xmm6, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
-; SSE2-NEXT: movups %xmm1, 16(%rcx)
-; SSE2-NEXT: movups %xmm6, (%rcx)
+; SSE2-NEXT: movups %xmm5, 16(%rcx)
+; SSE2-NEXT: movups %xmm7, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
index cd9d2692ff3f..c6815a278f82 100644
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -34,18 +34,18 @@ define <4 x float> @t2(<4 x float>* %P) nounwind {
; X32-LABEL: t2:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movaps (%eax), %xmm1
; X32-NEXT: xorps %xmm0, %xmm0
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT: xorps %xmm1, %xmm1
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; X32-NEXT: retl
;
; X64-LABEL: t2:
; X64: # %bb.0:
-; X64-NEXT: movaps (%rdi), %xmm1
; X64-NEXT: xorps %xmm0, %xmm0
-; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
@@ -74,18 +74,18 @@ define <4 x float> @t4(<4 x float>* %P) nounwind {
; X32-LABEL: t4:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: xorps %xmm1, %xmm1
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; X32-NEXT: retl
;
; X64-LABEL: t4:
; X64: # %bb.0:
-; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: xorps %xmm1, %xmm1
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
%tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index dd67f9bfc430..9ba639784e1d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2496,16 +2496,15 @@ define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, <4 x float>* %a1) {
define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, <4 x float>* %a1) {
; SSE-LABEL: shuffle_mem_v4f32_4760:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovaps (%rdi), %xmm1
-; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
+; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_mem_v4f32_4760:
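The test deltas show the payoff: once the load is commuted to the RHS, it folds straight into the shufps memory operand and the separate movaps disappears. Roughly, in intrinsics terms (a hand-written analogue of the shuffle_mem_v4f32_4760 pattern above, not code from the tree), the two forms look like:

#include <xmmintrin.h>

// Before: the load lands in the LHS of the inner shufps, so it needs its
// own movaps before the shuffle can use it.
__m128 before_commute(__m128 a, const float *p) {
  __m128 l = _mm_load_ps(p);                                 // movaps (%rdi), %xmm1
  __m128 t = _mm_shuffle_ps(l, a, _MM_SHUFFLE(0, 2, 0, 0));  // load is the LHS
  return _mm_shuffle_ps(a, t, _MM_SHUFFLE(0, 2, 3, 0));
}

// After: the inner immediate is nibble-swapped (0x20 -> 0x02), and since
// the inner node feeds the RHS of the outer shufps, the outer immediate
// is XORed with 0xA0 (0x2C -> 0x8C). The load, now the RHS, can fold
// into the shufps memory operand.
__m128 after_commute(__m128 a, const float *p) {
  __m128 l = _mm_load_ps(p);
  __m128 t = _mm_shuffle_ps(a, l, _MM_SHUFFLE(0, 0, 0, 2));  // load is the RHS
  return _mm_shuffle_ps(a, t, _MM_SHUFFLE(2, 0, 3, 0));
}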