[llvm] r318468 - [X86] Add DAG combine to remove sext i32->i64 from gather/scatter instructions.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 16 15:09:06 PST 2017


Author: ctopper
Date: Thu Nov 16 15:09:06 2017
New Revision: 318468

URL: http://llvm.org/viewvc/llvm-project?rev=318468&view=rev
Log:
[X86] Add DAG combine to remove sext i32->i64 from gather/scatter instructions.

Only do this pre-legalization, in case we're relying on the sign extend to legalize the index type for KNL.

This recovers all of the tests that changed when I stopped SelectionDAGBuilder from deleting sign extends.

There's more work that could be done here, particularly to fix the i8->i64 test case that gets split.
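
For illustration, the IR shape this combine targets is a sign-extended i32 index feeding a masked gather, as in the modified tests below. A minimal sketch mirroring test15 (the function name here is made up; the mask and passthru values are illustrative):

define <4 x float> @gather_sext_index(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
; Before this patch the sext survived to isel and forced a qword-indexed
; gather (vgatherqps); with the combine the i32 index is used directly and
; SKX can select a dword-indexed gather (vgatherdps).
  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep = getelementptr float, float* %base, <4 x i64> %sext_ind
  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep, i32 4, <4 x i1> %mask, <4 x float> undef)
  ret <4 x float> %res
}
declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)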

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=318468&r1=318467&r2=318468&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Nov 16 15:09:06 2017
@@ -35836,7 +35836,7 @@ static SDValue combineGatherScatter(SDNo
   // Pre-shrink oversized index elements to avoid triggering scalarization.
   if (DCI.isBeforeLegalize()) {
     SDValue Index = N->getOperand(4);
-    if (Index.getValueType().getScalarSizeInBits() > 64) {
+    if (Index.getScalarValueSizeInBits() > 64) {
       EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64,
                                    Index.getValueType().getVectorNumElements());
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
@@ -35846,6 +35846,27 @@ static SDValue combineGatherScatter(SDNo
       DCI.AddToWorklist(N);
       return SDValue(N, 0);
     }
+  }
+
+  // Try to remove sign extends from i32 to i64 on the index.
+  // Only do this before legalize ops in case we are relying on the sign
+  // extend for legalization.
+  // TODO: We should maybe remove any sign extend once we learn how to sign
+  // extend narrow indices during lowering.
+  if (DCI.isBeforeLegalizeOps()) {
+    SDValue Index = N->getOperand(4);
+    if (Index.getScalarValueSizeInBits() == 64 &&
+        Index.getOpcode() == ISD::SIGN_EXTEND &&
+        Index.getOperand(0).getScalarValueSizeInBits() == 32) {
+      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+      NewOps[4] = Index.getOperand(0);
+      DAG.UpdateNodeOperands(N, NewOps);
+      // The original sign extend now has fewer users; add it back to the
+      // worklist in case it needs to be removed.
+      DCI.AddToWorklist(Index.getNode());
+      DCI.AddToWorklist(N);
+      return SDValue(N, 0);
+    }
   }
 
   // Gather and Scatter instructions use k-registers for masks. The type of

Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=318468&r1=318467&r2=318468&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Thu Nov 16 15:09:06 2017
@@ -19,52 +19,32 @@
 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
 ; KNL_64-LABEL: test1:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test1:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    kxnorw %k0, %k0, %k2
-; SKX-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test1:
 ; SKX_32:       # BB#0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    kxnorw %k0, %k0, %k2
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
@@ -96,52 +76,32 @@ declare <8 x i32> @llvm.masked.gather.v8
 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
 ; KNL_64-LABEL: test2:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT:    kmovw %esi, %k1
-; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test2:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; SKX-NEXT:    kmovw %esi, %k1
-; SKX-NEXT:    kshiftrw $8, %k1, %k2
-; SKX-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test2:
 ; SKX_32:       # BB#0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
@@ -157,52 +117,32 @@ define <16 x float> @test2(float* %base,
 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
 ; KNL_64-LABEL: test3:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT:    kmovw %esi, %k1
-; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
-; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test3:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
-; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT:    vpgatherqd (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; SKX-NEXT:    kmovw %esi, %k1
-; SKX-NEXT:    kshiftrw $8, %k1, %k2
-; SKX-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT:    vpgatherqd (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test3:
 ; SKX_32:       # BB#0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
-; SKX_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT:    vpgatherqd (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
@@ -731,52 +671,32 @@ define <16 x float> @test11(float* %base
 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
 ; KNL_64-LABEL: test12:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test12:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test12:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    kxnorw %k0, %k0, %k2
-; SKX-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test12:
 ; SKX_32:       # BB#0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    kxnorw %k0, %k0, %k2
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX_32-NEXT:    retl
 
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
@@ -790,52 +710,32 @@ define <16 x float> @test12(float* %base
 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
 ; KNL_64-LABEL: test13:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test13:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test13:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    kxnorw %k0, %k0, %k2
-; SKX-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test13:
 ; SKX_32:       # BB#0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    kxnorw %k0, %k0, %k2
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX_32-NEXT:    retl
 
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
@@ -911,8 +811,9 @@ declare <2 x double> @llvm.masked.gather
 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
 ; KNL_64-LABEL: test15:
 ; KNL_64:       # BB#0:
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; KNL_64-NEXT:    vmovdqa %xmm1, %xmm1
-; KNL_64-NEXT:    vpmovsxdq %xmm0, %ymm2
+; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm2
 ; KNL_64-NEXT:    vpslld $31, %ymm1, %ymm0
 ; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
@@ -922,9 +823,10 @@ define <4 x float> @test15(float* %base,
 ;
 ; KNL_32-LABEL: test15:
 ; KNL_32:       # BB#0:
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; KNL_32-NEXT:    vmovdqa %xmm1, %xmm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %xmm0, %ymm2
+; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm2
 ; KNL_32-NEXT:    vpslld $31, %ymm1, %ymm0
 ; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
@@ -936,9 +838,8 @@ define <4 x float> @test15(float* %base,
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
 ; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; SKX-NEXT:    vpmovsxdq %xmm0, %ymm1
-; SKX-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
-; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT:    vmovaps %xmm1, %xmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test15:
@@ -946,9 +847,8 @@ define <4 x float> @test15(float* %base,
 ; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
 ; SKX_32-NEXT:    vptestmd %xmm1, %xmm1, %k1
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovsxdq %xmm0, %ymm1
-; SKX_32-NEXT:    vgatherqps (%eax,%ymm1,4), %xmm0 {%k1}
-; SKX_32-NEXT:    vzeroupper
+; SKX_32-NEXT:    vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %xmm1, %xmm0
 ; SKX_32-NEXT:    retl
 
   %sext_ind = sext <4 x i32> %ind to <4 x i64>
@@ -962,11 +862,12 @@ define <4 x double> @test16(double* %bas
 ; KNL_64-LABEL: test16:
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; KNL_64-NEXT:    vmovdqa %ymm1, %ymm1
-; KNL_64-NEXT:    vpmovsxdq %xmm0, %ymm0
+; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
@@ -976,12 +877,13 @@ define <4 x double> @test16(double* %bas
 ; KNL_32-LABEL: test16:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; KNL_32-NEXT:    vmovdqa %ymm1, %ymm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %xmm0, %ymm0
+; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_32-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
@@ -992,8 +894,7 @@ define <4 x double> @test16(double* %bas
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
 ; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; SKX-NEXT:    vpmovsxdq %xmm0, %ymm0
-; SKX-NEXT:    vgatherqpd (%rdi,%ymm0,8), %ymm2 {%k1}
+; SKX-NEXT:    vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0
 ; SKX-NEXT:    retq
 ;
@@ -1002,8 +903,7 @@ define <4 x double> @test16(double* %bas
 ; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
 ; SKX_32-NEXT:    vptestmd %xmm1, %xmm1, %k1
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovsxdq %xmm0, %ymm0
-; SKX_32-NEXT:    vgatherqpd (%eax,%ymm0,8), %ymm2 {%k1}
+; SKX_32-NEXT:    vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
 ; SKX_32-NEXT:    vmovapd %ymm2, %ymm0
 ; SKX_32-NEXT:    retl
 
@@ -1017,9 +917,8 @@ define <2 x double> @test17(double* %bas
 ; KNL_64-LABEL: test17:
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL_64-NEXT:    vmovdqa %xmm1, %xmm1
-; KNL_64-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT:    vpsraq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
@@ -1030,10 +929,9 @@ define <2 x double> @test17(double* %bas
 ; KNL_32-LABEL: test17:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL_32-NEXT:    vmovdqa %xmm1, %xmm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT:    vpsraq $32, %zmm0, %zmm0
 ; KNL_32-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
@@ -1045,8 +943,6 @@ define <2 x double> @test17(double* %bas
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
-; SKX-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT:    vpsraq $32, %xmm0, %xmm0
 ; SKX-NEXT:    vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
 ; SKX-NEXT:    vmovapd %xmm2, %xmm0
 ; SKX-NEXT:    retq
@@ -1056,8 +952,6 @@ define <2 x double> @test17(double* %bas
 ; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k1
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT:    vpsraq $32, %xmm0, %xmm0
 ; SKX_32-NEXT:    vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
 ; SKX_32-NEXT:    vmovapd %xmm2, %xmm0
 ; SKX_32-NEXT:    retl
@@ -1268,10 +1162,10 @@ define <2 x float> @test22(float* %base,
 ; KNL_64-LABEL: test22:
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
+; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
 ; KNL_64-NEXT:    vmovaps %xmm1, %xmm1
-; KNL_64-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT:    vpsraq $32, %zmm0, %zmm0
+; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT:    vpslld $31, %ymm1, %ymm1
 ; KNL_64-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
@@ -1282,11 +1176,11 @@ define <2 x float> @test22(float* %base,
 ; KNL_32-LABEL: test22:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
+; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
 ; KNL_32-NEXT:    vmovaps %xmm1, %xmm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT:    vpsraq $32, %zmm0, %zmm0
+; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_32-NEXT:    vpslld $31, %ymm1, %ymm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
@@ -1296,22 +1190,20 @@ define <2 x float> @test22(float* %base,
 ;
 ; SKX-LABEL: test22:
 ; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
-; SKX-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT:    vpsraq $32, %xmm0, %xmm0
-; SKX-NEXT:    vgatherqps (%rdi,%xmm0,4), %xmm2 {%k1}
+; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test22:
 ; SKX_32:       # BB#0:
+; SKX_32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k1
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT:    vpsraq $32, %xmm0, %xmm0
-; SKX_32-NEXT:    vgatherqps (%eax,%xmm0,4), %xmm2 {%k1}
+; SKX_32-NEXT:    vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
 ; SKX_32-NEXT:    vmovaps %xmm2, %xmm0
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
@@ -1376,9 +1268,8 @@ define <2 x i32> @test23(i32* %base, <2
 ; KNL_64-LABEL: test23:
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL_64-NEXT:    vmovdqa %xmm1, %xmm1
-; KNL_64-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT:    vpsraq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
@@ -1389,10 +1280,9 @@ define <2 x i32> @test23(i32* %base, <2
 ; KNL_32-LABEL: test23:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL_32-NEXT:    vmovdqa %xmm1, %xmm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT:    vpsraq $32, %zmm0, %zmm0
 ; KNL_32-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
@@ -1404,8 +1294,6 @@ define <2 x i32> @test23(i32* %base, <2
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
-; SKX-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT:    vpsraq $32, %xmm0, %xmm0
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; SKX-NEXT:    vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
 ; SKX-NEXT:    vpmovsxdq %xmm1, %xmm0
@@ -1416,8 +1304,6 @@ define <2 x i32> @test23(i32* %base, <2
 ; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k1
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT:    vpsraq $32, %xmm0, %xmm0
 ; SKX_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; SKX_32-NEXT:    vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
 ; SKX_32-NEXT:    vpmovsxdq %xmm1, %xmm0
@@ -1431,32 +1317,28 @@ define <2 x i32> @test23(i32* %base, <2
 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
 ; KNL_64-LABEL: test24:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT:    vpsraq $32, %zmm0, %zmm1
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL_64-NEXT:    movb $3, %al
 ; KNL_64-NEXT:    kmovw %eax, %k1
-; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm1,8), %zmm0 {%k1}
-; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
+; KNL_64-NEXT:    vmovdqa %xmm1, %xmm0
 ; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test24:
 ; KNL_32:       # BB#0:
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT:    vpsraq $32, %zmm0, %zmm1
-; KNL_32-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,0,1,0]
-; KNL_32-NEXT:    vpsllq $63, %zmm0, %zmm0
-; KNL_32-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; KNL_32-NEXT:    vpgatherqq (%eax,%zmm1,8), %zmm0 {%k1}
-; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL_32-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,0,1,0]
+; KNL_32-NEXT:    vpsllq $63, %zmm1, %zmm1
+; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
+; KNL_32-NEXT:    vmovdqa %xmm1, %xmm0
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test24:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT:    vpsraq $32, %xmm0, %xmm0
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX-NEXT:    vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
 ; SKX-NEXT:    vpmovsxdq %xmm1, %xmm0
@@ -1465,8 +1347,6 @@ define <2 x i32> @test24(i32* %base, <2
 ; SKX_32-LABEL: test24:
 ; SKX_32:       # BB#0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT:    vpsraq $32, %xmm0, %xmm0
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX_32-NEXT:    vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
 ; SKX_32-NEXT:    vpmovsxdq %xmm1, %xmm0
@@ -1481,9 +1361,8 @@ define <2 x i64> @test25(i64* %base, <2
 ; KNL_64-LABEL: test25:
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL_64-NEXT:    vmovdqa %xmm1, %xmm1
-; KNL_64-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT:    vpsraq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
@@ -1494,10 +1373,9 @@ define <2 x i64> @test25(i64* %base, <2
 ; KNL_32-LABEL: test25:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL_32-NEXT:    vmovdqa %xmm1, %xmm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT:    vpsraq $32, %zmm0, %zmm0
 ; KNL_32-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
@@ -1509,8 +1387,6 @@ define <2 x i64> @test25(i64* %base, <2
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
-; SKX-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT:    vpsraq $32, %xmm0, %xmm0
 ; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
 ; SKX-NEXT:    vmovdqa %xmm2, %xmm0
 ; SKX-NEXT:    retq
@@ -1520,8 +1396,6 @@ define <2 x i64> @test25(i64* %base, <2
 ; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k1
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT:    vpsraq $32, %xmm0, %xmm0
 ; SKX_32-NEXT:    vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
 ; SKX_32-NEXT:    vmovdqa %xmm2, %xmm0
 ; SKX_32-NEXT:    retl
@@ -1535,8 +1409,7 @@ define <2 x i64> @test26(i64* %base, <2
 ; KNL_64-LABEL: test26:
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; KNL_64-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT:    vpsraq $32, %zmm0, %zmm0
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL_64-NEXT:    movb $3, %al
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
@@ -1547,9 +1420,8 @@ define <2 x i64> @test26(i64* %base, <2
 ; KNL_32-LABEL: test26:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT:    vpsraq $32, %zmm0, %zmm0
 ; KNL_32-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,1,0]
 ; KNL_32-NEXT:    vpsllq $63, %zmm2, %zmm2
 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
@@ -1560,8 +1432,6 @@ define <2 x i64> @test26(i64* %base, <2
 ;
 ; SKX-LABEL: test26:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT:    vpsraq $32, %xmm0, %xmm0
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1570,8 +1440,6 @@ define <2 x i64> @test26(i64* %base, <2
 ; SKX_32-LABEL: test26:
 ; SKX_32:       # BB#0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT:    vpsraq $32, %xmm0, %xmm0
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX_32-NEXT:    vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
 ; SKX_32-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1586,8 +1454,8 @@ define <2 x i64> @test26(i64* %base, <2
 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
 ; KNL_64-LABEL: test27:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT:    vpsraq $32, %zmm0, %zmm1
+; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
 ; KNL_64-NEXT:    movb $3, %al
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
@@ -1597,9 +1465,9 @@ define <2 x float> @test27(float* %base,
 ;
 ; KNL_32-LABEL: test27:
 ; KNL_32:       # BB#0:
+; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT:    vpsraq $32, %zmm0, %zmm1
+; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
 ; KNL_32-NEXT:    movb $3, %cl
 ; KNL_32-NEXT:    kmovw %ecx, %k1
 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
@@ -1609,19 +1477,19 @@ define <2 x float> @test27(float* %base,
 ;
 ; SKX-LABEL: test27:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT:    vpsraq $32, %xmm0, %xmm1
-; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
+; SKX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SKX-NEXT:    movb $3, %al
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test27:
 ; SKX_32:       # BB#0:
+; SKX_32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT:    vpsraq $32, %xmm0, %xmm1
-; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    vgatherqps (%eax,%xmm1,4), %xmm0 {%k1}
+; SKX_32-NEXT:    movb $3, %cl
+; SKX_32-NEXT:    kmovw %ecx, %k1
+; SKX_32-NEXT:    vgatherdps (%eax,%xmm1,4), %xmm0 {%k1}
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
@@ -1685,56 +1553,36 @@ define void @test28(<2 x i32>%a1, <2 x i
 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
 ; KNL_64-LABEL: test29:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
-; KNL_64-NEXT:    kxorw %k0, %k0, %k1
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
-; KNL_64-NEXT:    movb $44, %al
+; KNL_64-NEXT:    movw $44, %ax
 ; KNL_64-NEXT:    kmovw %eax, %k1
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test29:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT:    kxorw %k0, %k0, %k1
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
-; KNL_32-NEXT:    movb $44, %cl
+; KNL_32-NEXT:    movw $44, %cx
 ; KNL_32-NEXT:    kmovw %ecx, %k1
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test29:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX-NEXT:    vpmovsxdq %ymm0, %zmm0
-; SKX-NEXT:    kxorw %k0, %k0, %k1
-; SKX-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
-; SKX-NEXT:    movb $44, %al
+; SKX-NEXT:    movw $44, %ax
 ; SKX-NEXT:    kmovw %eax, %k1
-; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test29:
 ; SKX_32:       # BB#0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; SKX_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; SKX_32-NEXT:    vpmovsxdq %ymm0, %zmm0
-; SKX_32-NEXT:    kxorw %k0, %k0, %k1
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
-; SKX_32-NEXT:    movb $44, %cl
+; SKX_32-NEXT:    movw $44, %cx
 ; SKX_32-NEXT:    kmovw %ecx, %k1
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0