[llvm] r325354 - [SelectionDAG] Enable SimplifyDemandedVectorElts support for simplifying shuffle masks

Fri Feb 16 08:22:14 PST 2018

Author: rksimon
Date: Fri Feb 16 08:22:14 2018
New Revision: 325354

URL: http://llvm.org/viewvc/llvm-project?rev=325354&view=rev
Log:
[SelectionDAG] Enable SimplifyDemandedVectorElts support for simplifying shuffle masks

Based off the DemandedElts mask the and UNDEF elements returned from the SimplifyDemandedVectorElts calls to the shuffle operands, we can attempt to simplify the shuffle mask.

I had to be very conservative here as accepting post-legalized shuffle masks could cause problems for targets that legalize UNDEF mask elements back to inrange values (PowerPC), similarly combining to identity shuffle masks could cause too much UNDEF information to disappear for later combines.

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/trunk/test/CodeGen/X86/sse3.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=325354&r1=325353&r2=325354&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Fri Feb 16 08:22:14 2018
@@ -1458,6 +1458,31 @@ bool TargetLowering::SimplifyDemandedVec
                                    ZeroRHS, TLO, Depth + 1))
       return true;
 
+    // Simplify mask using undef elements from LHS/RHS.
+    bool Updated = false;
+    bool IdentityLHS = true, IdentityRHS = true;
+    SmallVector<int, 32> NewMask(ShuffleMask.begin(), ShuffleMask.end());
+    for (int i = 0; i != NumElts; ++i) {
+      int &M = NewMask[i];
+      if (M < 0)
+        continue;
+      if (!DemandedElts[i] || (M < (int)NumElts && UndefLHS[M]) ||
+          (M >= (int)NumElts && UndefRHS[M - NumElts])) {
+        Updated = true;
+        M = -1;
+      }
+      IdentityLHS &= (M < 0) || (M == i);
+      IdentityRHS &= (M < 0) || ((M - NumElts) == i);
+    }
+
+    // Update legal shuffle masks based on demanded elements if it won't reduce
+    // to Identity which can cause premature removal of the shuffle mask.
+    if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps &&
+        isShuffleMaskLegal(NewMask, VT))
+      return TLO.CombineTo(Op,
+                           TLO.DAG.getVectorShuffle(VT, DL, Op.getOperand(0),
+                                                    Op.getOperand(1), NewMask));
+
     // Propagate undef/zero elements from LHS/RHS.
     for (unsigned i = 0; i != NumElts; ++i) {
       int M = ShuffleMask[i];

Modified: llvm/trunk/test/CodeGen/X86/sse3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse3.ll?rev=325354&r1=325353&r2=325354&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse3.ll Fri Feb 16 08:22:14 2018
@@ -396,18 +396,14 @@ entry:
 define <4 x i32> @t17() nounwind {
 ; X86-LABEL: t17:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movaps (%eax), %xmm0
-; X86-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X86-NEXT:    xorps %xmm1, %xmm1
-; X86-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
+; X86-NEXT:    andpd {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t17:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movaps (%rax), %xmm0
-; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-NEXT:    andpd {{.*}}(%rip), %xmm0
 ; X64-NEXT:    retq
 entry:
   %tmp1 = load <4 x float>, <4 x float>* undef, align 16

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=325354&r1=325353&r2=325354&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Fri Feb 16 08:22:14 2018
@@ -1563,22 +1563,22 @@ define <4 x i32> @shuffle_v4i32_2456(<4
 ;
 ; SSSE3-LABEL: shuffle_v4i32_2456:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v4i32_2456:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,1,2]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_2456:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX-NEXT:    retq
   %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
   %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll?rev=325354&r1=325353&r2=325354&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll Fri Feb 16 08:22:14 2018
@@ -973,45 +973,25 @@ define <32 x i8> @PR27320(<8 x i32> %a0)
 }
 
 define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
-; X32-AVX2-LABEL: PR34577:
-; X32-AVX2:       # %bb.0: # %entry
-; X32-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X32-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X32-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; X32-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
-; X32-AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
-; X32-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; X32-AVX2-NEXT:    retl
+; X32-LABEL: PR34577:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; X32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; X32-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
+; X32-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; X32-NEXT:    retl
 ;
-; X32-AVX512-LABEL: PR34577:
-; X32-AVX512:       # %bb.0: # %entry
-; X32-AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X32-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; X32-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; X32-AVX512-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
-; X32-AVX512-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; X32-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; X32-AVX512-NEXT:    retl
-;
-; X64-AVX2-LABEL: PR34577:
-; X64-AVX2:       # %bb.0: # %entry
-; X64-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; X64-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
-; X64-AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
-; X64-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: PR34577:
-; X64-AVX512:       # %bb.0: # %entry
-; X64-AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; X64-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; X64-AVX512-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
-; X64-AVX512-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; X64-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; X64-AVX512-NEXT:    retq
+; X64-LABEL: PR34577:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; X64-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; X64-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
+; X64-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; X64-NEXT:    retq
 entry:
   %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
   %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer