[llvm] r340810 - [X86][SSE] Avoid vector extraction/insertion for non-constant uniform shifts

Tue Aug 28 03:14:09 PDT 2018

Author: rksimon
Date: Tue Aug 28 03:14:09 2018
New Revision: 340810

URL: http://llvm.org/viewvc/llvm-project?rev=340810&view=rev
Log:
[X86][SSE] Avoid vector extraction/insertion for non-constant uniform shifts

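As discussed on D51263, on pre-SSE41 hardware we're better off using byte shifts (PSLLDQ/PSRLDQ) to clear the upper bits of the vector holding the shift amount, rather than extracting the amount to a GPR and re-inserting it (PEXTRW + MOVD).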

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/lower-vec-shift-2.ll
    llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
    llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll
    llvm/trunk/test/CodeGen/X86/vector-shift-lshr-128.ll
    llvm/trunk/test/CodeGen/X86/vector-shift-shl-128.ll
    llvm/trunk/test/CodeGen/X86/vshift-4.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=340810&r1=340809&r2=340810&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Aug 28 03:14:09 2018
@@ -20451,30 +20451,43 @@ static SDValue getTargetVShiftNode(unsig
   // | i64             | Yes, No    | Use ShAmt as lowest elt               |
   // | i32             | Yes        | zero-extend in-reg                    |
   // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
+  // | (i32 zext(i16)) | No         | byte-shift-in-reg                     |
   // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
   // +=================+============+=======================================+
 
   if (SVT == MVT::i64)
     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
-  else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+  else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+           ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
            ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
     ShAmt = ShAmt.getOperand(0);
-    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
-    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
+    MVT AmtTy = MVT::v8i16;
+    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
+    if (Subtarget.hasSSE41())
+      ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
+    else {
+      SDValue ByteShift = DAG.getConstant(
+          (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
+      ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
+      ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
+                          ByteShift);
+      ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
+                          ByteShift);
+    }
   } else if (Subtarget.hasSSE41() &&
              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
     ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
   } else {
-    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
-                        DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
+    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
+                        DAG.getUNDEF(SVT)};
     ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
   }
 
   // The return type has to be a 128-bit type with the same element
   // type as the input type.
   MVT EltVT = VT.getVectorElementType();
-  MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+  MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
 
   ShAmt = DAG.getBitcast(ShVT, ShAmt);
   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);

Modified: llvm/trunk/test/CodeGen/X86/lower-vec-shift-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/lower-vec-shift-2.ll?rev=340810&r1=340809&r2=340810&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/lower-vec-shift-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/lower-vec-shift-2.ll Tue Aug 28 03:14:09 2018
@@ -5,8 +5,8 @@
 define <8 x i16> @test1(<8 x i16> %A, <8 x i16> %B) {
 ; SSE2-LABEL: test1:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    pextrw $0, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psllw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -59,8 +59,8 @@ entry:
 define <8 x i16> @test4(<8 x i16> %A, <8 x i16> %B) {
 ; SSE2-LABEL: test4:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    pextrw $0, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psrlw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -113,8 +113,8 @@ entry:
 define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) {
 ; SSE2-LABEL: test7:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    pextrw $0, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psraw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll?rev=340810&r1=340809&r2=340810&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll Tue Aug 28 03:14:09 2018
@@ -877,15 +877,15 @@ define <8 x i16> @splatvar_rotate_v8i16(
 ; SSE2-LABEL: splatvar_rotate_v8i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT:    pextrw $0, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    psllw %xmm1, %xmm3
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
 ; SSE2-NEXT:    psubw %xmm2, %xmm1
-; SSE2-NEXT:    pextrw $0, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psrlw %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
@@ -993,15 +993,15 @@ define <8 x i16> @splatvar_rotate_v8i16(
 ; X32-SSE-LABEL: splatvar_rotate_v8i16:
 ; X32-SSE:       # %bb.0:
 ; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
-; X32-SSE-NEXT:    movd %eax, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT:    psllw %xmm1, %xmm3
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
 ; X32-SSE-NEXT:    psubw %xmm2, %xmm1
-; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
-; X32-SSE-NEXT:    movd %eax, %xmm1
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
 ; X32-SSE-NEXT:    por %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl

Modified: llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll?rev=340810&r1=340809&r2=340810&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll Tue Aug 28 03:14:09 2018
@@ -736,8 +736,8 @@ define <4 x i32> @splatvar_shift_v4i32(<
 define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-LABEL: splatvar_shift_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pextrw $0, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psraw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -773,8 +773,8 @@ define <8 x i16> @splatvar_shift_v8i16(<
 ;
 ; X32-SSE-LABEL: splatvar_shift_v8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
-; X32-SSE-NEXT:    movd %eax, %xmm1
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    psraw %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer

Modified: llvm/trunk/test/CodeGen/X86/vector-shift-lshr-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-lshr-128.ll?rev=340810&r1=340809&r2=340810&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-lshr-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-lshr-128.ll Tue Aug 28 03:14:09 2018
@@ -603,8 +603,8 @@ define <4 x i32> @splatvar_shift_v4i32(<
 define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-LABEL: splatvar_shift_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pextrw $0, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psrlw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -640,8 +640,8 @@ define <8 x i16> @splatvar_shift_v8i16(<
 ;
 ; X32-SSE-LABEL: splatvar_shift_v8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
-; X32-SSE-NEXT:    movd %eax, %xmm1
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer

Modified: llvm/trunk/test/CodeGen/X86/vector-shift-shl-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-shl-128.ll?rev=340810&r1=340809&r2=340810&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-shl-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-shl-128.ll Tue Aug 28 03:14:09 2018
@@ -520,8 +520,8 @@ define <4 x i32> @splatvar_shift_v4i32(<
 define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-LABEL: splatvar_shift_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pextrw $0, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psllw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -557,8 +557,8 @@ define <8 x i16> @splatvar_shift_v8i16(<
 ;
 ; X32-SSE-LABEL: splatvar_shift_v8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
-; X32-SSE-NEXT:    movd %eax, %xmm1
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    psllw %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer

Modified: llvm/trunk/test/CodeGen/X86/vshift-4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vshift-4.ll?rev=340810&r1=340809&r2=340810&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vshift-4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vshift-4.ll Tue Aug 28 03:14:09 2018
@@ -136,16 +136,18 @@ define void @shift3a(<8 x i16> %val, <8
 ; X32-LABEL: shift3a:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    pextrw $6, %xmm1, %ecx
-; X32-NEXT:    movd %ecx, %xmm1
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; X32-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-NEXT:    psllw %xmm1, %xmm0
 ; X32-NEXT:    movdqa %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: shift3a:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pextrw $6, %xmm1, %eax
-; X64-NEXT:    movd %eax, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; X64-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X64-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X64-NEXT:    psllw %xmm1, %xmm0
 ; X64-NEXT:    movdqa %xmm0, (%rdi)
 ; X64-NEXT:    retq