[llvm-branch-commits] [llvm] 2ed914c - [X86][SSE] getFauxShuffleMask - handle PACKSS(SRAI(), SRAI()) shuffle patterns.

Simon Pilgrim via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Tue Jan 12 06:14:11 PST 2021


Author: Simon Pilgrim
Date: 2021-01-12T14:07:53Z
New Revision: 2ed914cb7e9c0737bdf60a0b1fd48b6499973325

URL: https://github.com/llvm/llvm-project/commit/2ed914cb7e9c0737bdf60a0b1fd48b6499973325
DIFF: https://github.com/llvm/llvm-project/commit/2ed914cb7e9c0737bdf60a0b1fd48b6499973325.diff

LOG: [X86][SSE] getFauxShuffleMask - handle PACKSS(SRAI(),SRAI()) shuffle patterns.

We can't easily treat ASHR as a faux shuffle, but if it was just feeding a PACKSS then it was likely being used as a sign-extension for a truncation, so just peek through it and adjust the mask accordingly.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/psubus.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 750c809eafca..f28e28689806 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7685,12 +7685,26 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
 
     // If we know input saturation won't happen (or we don't care for particular
     // lanes), we can treat this as a truncation shuffle.
+    bool Offset0 = false, Offset1 = false;
     if (Opcode == X86ISD::PACKSS) {
       if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
            DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
           (!(N1.isUndef() || EltsRHS.isNullValue()) &&
            DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
         return false;
+      // We can't easily fold ASHR into a shuffle, but if it was feeding a
+      // PACKSS then it was likely being used for sign-extension for a
+      // truncation, so just peek through and adjust the mask accordingly.
+      if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
+          N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
+        Offset0 = true;
+        N0 = N0.getOperand(0);
+      }
+      if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
+          N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
+        Offset1 = true;
+        N1 = N1.getOperand(0);
+      }
     } else {
       APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
       if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
@@ -7707,6 +7721,13 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
       Ops.push_back(N1);
 
     createPackShuffleMask(VT, Mask, IsUnary);
+
+    if (Offset0 || Offset1) {
+      for (int &M : Mask)
+        if ((Offset0 && isInRange(M, 0, NumElts)) ||
+            (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
+          ++M;
+    }
     return true;
   }
   case X86ISD::VTRUNC: {

diff  --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 06240cd8bad3..351629a732c1 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1403,11 +1403,6 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE2-NEXT:    psrad $16, %xmm5
 ; SSE2-NEXT:    packssdw %xmm6, %xmm5
 ; SSE2-NEXT:    psubusw %xmm5, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: psubus_8i32_max:
@@ -1738,111 +1733,91 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
 ; SSE2-LABEL: psubus_16i32_max:
 ; SSE2:       # %bb.0: # %vector.ph
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm3, %xmm8
+; SSE2-NEXT:    movdqa %xmm5, %xmm8
 ; SSE2-NEXT:    pxor %xmm9, %xmm8
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE2-NEXT:    movdqa %xmm7, %xmm6
 ; SSE2-NEXT:    pcmpgtd %xmm8, %xmm6
 ; SSE2-NEXT:    pcmpeqd %xmm8, %xmm8
-; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pand %xmm6, %xmm5
 ; SSE2-NEXT:    pxor %xmm8, %xmm6
-; SSE2-NEXT:    por %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm5, %xmm6
 ; SSE2-NEXT:    pslld $16, %xmm6
 ; SSE2-NEXT:    psrad $16, %xmm6
-; SSE2-NEXT:    movdqa %xmm2, %xmm10
+; SSE2-NEXT:    movdqa %xmm4, %xmm10
 ; SSE2-NEXT:    pxor %xmm9, %xmm10
-; SSE2-NEXT:    movdqa %xmm7, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm10, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    por %xmm2, %xmm3
-; SSE2-NEXT:    pslld $16, %xmm3
-; SSE2-NEXT:    psrad $16, %xmm3
-; SSE2-NEXT:    packssdw %xmm6, %xmm3
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
-; SSE2-NEXT:    pxor %xmm9, %xmm2
+; SSE2-NEXT:    movdqa %xmm7, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pslld $16, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    packssdw %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm9, %xmm4
 ; SSE2-NEXT:    movdqa %xmm7, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT:    pand %xmm6, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm3
 ; SSE2-NEXT:    pxor %xmm8, %xmm6
-; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    por %xmm3, %xmm6
 ; SSE2-NEXT:    pslld $16, %xmm6
 ; SSE2-NEXT:    psrad $16, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm9
+; SSE2-NEXT:    pxor %xmm2, %xmm9
 ; SSE2-NEXT:    pcmpgtd %xmm9, %xmm7
 ; SSE2-NEXT:    pxor %xmm7, %xmm8
-; SSE2-NEXT:    pand %xmm4, %xmm7
+; SSE2-NEXT:    pand %xmm2, %xmm7
 ; SSE2-NEXT:    por %xmm8, %xmm7
 ; SSE2-NEXT:    pslld $16, %xmm7
 ; SSE2-NEXT:    psrad $16, %xmm7
 ; SSE2-NEXT:    packssdw %xmm6, %xmm7
-; SSE2-NEXT:    psubusw %xmm7, %xmm1
-; SSE2-NEXT:    psubusw %xmm3, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    psrad $16, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    packssdw %xmm2, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    psrad $16, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    psubusw %xmm7, %xmm0
+; SSE2-NEXT:    psubusw %xmm5, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: psubus_16i32_max:
 ; SSSE3:       # %bb.0: # %vector.ph
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm8
+; SSSE3-NEXT:    movdqa %xmm5, %xmm8
 ; SSSE3-NEXT:    pxor %xmm9, %xmm8
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
 ; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm6
 ; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm8
-; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    pand %xmm6, %xmm5
 ; SSSE3-NEXT:    pxor %xmm8, %xmm6
-; SSSE3-NEXT:    por %xmm3, %xmm6
+; SSSE3-NEXT:    por %xmm5, %xmm6
 ; SSSE3-NEXT:    pslld $16, %xmm6
 ; SSSE3-NEXT:    psrad $16, %xmm6
-; SSSE3-NEXT:    movdqa %xmm2, %xmm10
+; SSSE3-NEXT:    movdqa %xmm4, %xmm10
 ; SSSE3-NEXT:    pxor %xmm9, %xmm10
-; SSSE3-NEXT:    movdqa %xmm7, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm2
-; SSSE3-NEXT:    pxor %xmm8, %xmm3
-; SSSE3-NEXT:    por %xmm2, %xmm3
-; SSSE3-NEXT:    pslld $16, %xmm3
-; SSSE3-NEXT:    psrad $16, %xmm3
-; SSSE3-NEXT:    packssdw %xmm6, %xmm3
-; SSSE3-NEXT:    movdqa %xmm5, %xmm2
-; SSSE3-NEXT:    pxor %xmm9, %xmm2
+; SSSE3-NEXT:    movdqa %xmm7, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pslld $16, %xmm5
+; SSSE3-NEXT:    psrad $16, %xmm5
+; SSSE3-NEXT:    packssdw %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm9, %xmm4
 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT:    pand %xmm6, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm3
 ; SSSE3-NEXT:    pxor %xmm8, %xmm6
-; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    por %xmm3, %xmm6
 ; SSSE3-NEXT:    pslld $16, %xmm6
 ; SSSE3-NEXT:    psrad $16, %xmm6
-; SSSE3-NEXT:    pxor %xmm4, %xmm9
+; SSSE3-NEXT:    pxor %xmm2, %xmm9
 ; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm7
 ; SSSE3-NEXT:    pxor %xmm7, %xmm8
-; SSSE3-NEXT:    pand %xmm4, %xmm7
+; SSSE3-NEXT:    pand %xmm2, %xmm7
 ; SSSE3-NEXT:    por %xmm8, %xmm7
 ; SSSE3-NEXT:    pslld $16, %xmm7
 ; SSSE3-NEXT:    psrad $16, %xmm7
 ; SSSE3-NEXT:    packssdw %xmm6, %xmm7
-; SSSE3-NEXT:    psubusw %xmm7, %xmm1
-; SSSE3-NEXT:    psubusw %xmm3, %xmm0
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    psrad $16, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    psrad $16, %xmm0
-; SSSE3-NEXT:    packssdw %xmm2, %xmm0
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT:    psrad $16, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    psrad $16, %xmm1
-; SSSE3-NEXT:    packssdw %xmm2, %xmm1
+; SSSE3-NEXT:    psubusw %xmm7, %xmm0
+; SSSE3-NEXT:    psubusw %xmm5, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: psubus_16i32_max:
@@ -1923,11 +1898,6 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin
 ; SSE2-NEXT:    psrad $16, %xmm5
 ; SSE2-NEXT:    packssdw %xmm6, %xmm5
 ; SSE2-NEXT:    psubusw %xmm5, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: psubus_i16_i32_max_swapped:


        


More information about the llvm-branch-commits mailing list