[llvm] r343273 - [ScalarizeMaskedMemIntrin] When expanding masked gathers, start with the passthru vector and insert the new load results into it.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 27 14:28:59 PDT 2018


Author: ctopper
Date: Thu Sep 27 14:28:59 2018
New Revision: 343273

URL: http://llvm.org/viewvc/llvm-project?rev=343273&view=rev
Log:
[ScalarizeMaskedMemIntrin] When expanding masked gathers, start with the passthru vector and insert the new load results into it.

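Previously we started with undef and did a final merge with the passthru at the end. Since the result vector now starts out as the passthru, the final select against the mask is no longer emitted.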

Modified:
    llvm/trunk/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
    llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll
    llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
    llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-gather.ll

Modified: llvm/trunk/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp?rev=343273&r1=343272&r2=343273&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp (original)
+++ llvm/trunk/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp Thu Sep 27 14:28:59 2018
@@ -368,10 +368,8 @@ static void scalarizeMaskedGather(CallIn
 
   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
 
-  Value *UndefVal = UndefValue::get(VecType);
-
   // The result vector
-  Value *VResult = UndefVal;
+  Value *VResult = Src0;
   unsigned VectorWidth = VecType->getNumElements();
 
   // Shorten the way if the mask is a vector of constants.
@@ -386,28 +384,17 @@ static void scalarizeMaskedGather(CallIn
       VResult = Builder.CreateInsertElement(
           VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx));
     }
-    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
-    CI->replaceAllUsesWith(NewI);
+    CI->replaceAllUsesWith(VResult);
     CI->eraseFromParent();
     return;
   }
 
-  PHINode *Phi = nullptr;
-  Value *PrevPhi = UndefVal;
-
   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
     // Fill the "else" block, created in the previous iteration
     //
     //  %Mask1 = extractelement <16 x i1> %Mask, i32 1
     //  br i1 %Mask1, label %cond.load, label %else
     //
-    if (Idx > 0) {
-      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
-      Phi->addIncoming(VResult, CondBlock);
-      Phi->addIncoming(PrevPhi, PrevIfBlock);
-      PrevPhi = Phi;
-      VResult = Phi;
-    }
 
     Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
                                                     "Mask" + Twine(Idx));
@@ -425,8 +412,9 @@ static void scalarizeMaskedGather(CallIn
                                               "Ptr" + Twine(Idx));
     LoadInst *Load =
         Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
-    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
-                                          "Res" + Twine(Idx));
+    Value *NewVResult = Builder.CreateInsertElement(VResult, Load,
+                                                    Builder.getInt32(Idx),
+                                                    "Res" + Twine(Idx));
 
     // Create "else" block, fill it in the next iteration
     BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
@@ -436,13 +424,14 @@ static void scalarizeMaskedGather(CallIn
     OldBr->eraseFromParent();
     PrevIfBlock = IfBlock;
     IfBlock = NewIfBlock;
+
+    PHINode *Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+    Phi->addIncoming(NewVResult, CondBlock);
+    Phi->addIncoming(VResult, PrevIfBlock);
+    VResult = Phi;
   }
 
-  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
-  Phi->addIncoming(VResult, CondBlock);
-  Phi->addIncoming(PrevPhi, PrevIfBlock);
-  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
-  CI->replaceAllUsesWith(NewI);
+  CI->replaceAllUsesWith(VResult);
   CI->eraseFromParent();
 }
 

Modified: llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll?rev=343273&r1=343272&r2=343273&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll Thu Sep 27 14:28:59 2018
@@ -30,25 +30,24 @@ define <2 x i32> @masked_gather_v2i32(<2
 ;
 ; NOGATHER-LABEL: masked_gather_v2i32:
 ; NOGATHER:       # %bb.0: # %entry
-; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm2
 ; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    # implicit-def: $xmm2
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB0_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm3, %rax
-; NOGATHER-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vmovq %xmm2, %rax
+; NOGATHER-NEXT:    movl (%rax), %eax
+; NOGATHER-NEXT:    vpinsrq $0, %rax, %xmm1, %xmm1
 ; NOGATHER-NEXT:  .LBB0_2: # %else
 ; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB0_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
 ; NOGATHER-NEXT:    movl (%rax), %eax
-; NOGATHER-NEXT:    vpinsrq $1, %rax, %xmm2, %xmm2
+; NOGATHER-NEXT:    vpinsrq $1, %rax, %xmm1, %xmm1
 ; NOGATHER-NEXT:  .LBB0_4: # %else2
-; NOGATHER-NEXT:    vpsllq $63, %xmm0, %xmm0
-; NOGATHER-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT:    vmovdqa %xmm1, %xmm0
 ; NOGATHER-NEXT:    retq
 entry:
   %ld  = load <2 x i32*>, <2 x i32*>* %ptr
@@ -80,26 +79,24 @@ define <4 x i32> @masked_gather_v2i32_co
 ;
 ; NOGATHER-LABEL: masked_gather_v2i32_concat:
 ; NOGATHER:       # %bb.0: # %entry
-; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm2
 ; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    # implicit-def: $xmm2
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB1_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm3, %rax
-; NOGATHER-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vmovq %xmm2, %rax
+; NOGATHER-NEXT:    movl (%rax), %eax
+; NOGATHER-NEXT:    vpinsrq $0, %rax, %xmm1, %xmm1
 ; NOGATHER-NEXT:  .LBB1_2: # %else
 ; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB1_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
 ; NOGATHER-NEXT:    movl (%rax), %eax
-; NOGATHER-NEXT:    vpinsrq $1, %rax, %xmm2, %xmm2
+; NOGATHER-NEXT:    vpinsrq $1, %rax, %xmm1, %xmm1
 ; NOGATHER-NEXT:  .LBB1_4: # %else2
-; NOGATHER-NEXT:    vpsllq $63, %xmm0, %xmm0
-; NOGATHER-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
-; NOGATHER-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOGATHER-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; NOGATHER-NEXT:    retq
 entry:
   %ld  = load <2 x i32*>, <2 x i32*>* %ptr
@@ -132,25 +129,23 @@ define <2 x float> @masked_gather_v2floa
 ;
 ; NOGATHER-LABEL: masked_gather_v2float:
 ; NOGATHER:       # %bb.0: # %entry
-; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm2
 ; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    # implicit-def: $xmm2
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB2_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm3, %rax
-; NOGATHER-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vmovq %xmm2, %rax
+; NOGATHER-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
 ; NOGATHER-NEXT:  .LBB2_2: # %else
 ; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB2_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; NOGATHER-NEXT:  .LBB2_4: # %else2
-; NOGATHER-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; NOGATHER-NEXT:    vpslld $31, %xmm0, %xmm0
-; NOGATHER-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT:    vmovaps %xmm1, %xmm0
 ; NOGATHER-NEXT:    retq
 entry:
   %ld  = load <2 x float*>, <2 x float*>* %ptr
@@ -180,25 +175,23 @@ define <4 x float> @masked_gather_v2floa
 ;
 ; NOGATHER-LABEL: masked_gather_v2float_concat:
 ; NOGATHER:       # %bb.0: # %entry
-; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm2
 ; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    # implicit-def: $xmm2
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB3_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm3, %rax
-; NOGATHER-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vmovq %xmm2, %rax
+; NOGATHER-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
 ; NOGATHER-NEXT:  .LBB3_2: # %else
 ; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB3_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; NOGATHER-NEXT:  .LBB3_4: # %else2
-; NOGATHER-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; NOGATHER-NEXT:    vpslld $31, %xmm0, %xmm0
-; NOGATHER-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT:    vmovaps %xmm1, %xmm0
 ; NOGATHER-NEXT:    retq
 entry:
   %ld  = load <2 x float*>, <2 x float*>* %ptr
@@ -229,27 +222,26 @@ define <4 x i32> @masked_gather_v4i32(<4
 ; NOGATHER-LABEL: masked_gather_v4i32:
 ; NOGATHER:       # %bb.0: # %entry
 ; NOGATHER-NEXT:    vpextrb $0, %xmm1, %eax
-; NOGATHER-NEXT:    # implicit-def: $xmm3
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB4_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
 ; NOGATHER-NEXT:    vmovq %xmm0, %rax
-; NOGATHER-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vpinsrd $0, (%rax), %xmm2, %xmm2
 ; NOGATHER-NEXT:  .LBB4_2: # %else
 ; NOGATHER-NEXT:    vpextrb $4, %xmm1, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB4_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
 ; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rax
-; NOGATHER-NEXT:    vpinsrd $1, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT:    vpinsrd $1, (%rax), %xmm2, %xmm2
 ; NOGATHER-NEXT:  .LBB4_4: # %else2
 ; NOGATHER-NEXT:    vpextrb $8, %xmm1, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB4_6
 ; NOGATHER-NEXT:  # %bb.5: # %cond.load4
-; NOGATHER-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; NOGATHER-NEXT:    vmovq %xmm4, %rax
-; NOGATHER-NEXT:    vpinsrd $2, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; NOGATHER-NEXT:    vmovq %xmm3, %rax
+; NOGATHER-NEXT:    vpinsrd $2, (%rax), %xmm2, %xmm2
 ; NOGATHER-NEXT:  .LBB4_6: # %else5
 ; NOGATHER-NEXT:    vpextrb $12, %xmm1, %eax
 ; NOGATHER-NEXT:    testb $1, %al
@@ -257,10 +249,9 @@ define <4 x i32> @masked_gather_v4i32(<4
 ; NOGATHER-NEXT:  # %bb.7: # %cond.load7
 ; NOGATHER-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rax
-; NOGATHER-NEXT:    vpinsrd $3, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT:    vpinsrd $3, (%rax), %xmm2, %xmm2
 ; NOGATHER-NEXT:  .LBB4_8: # %else8
-; NOGATHER-NEXT:    vpslld $31, %xmm1, %xmm0
-; NOGATHER-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; NOGATHER-NEXT:    vmovdqa %xmm2, %xmm0
 ; NOGATHER-NEXT:    vzeroupper
 ; NOGATHER-NEXT:    retq
 entry:
@@ -289,27 +280,27 @@ define <4 x float> @masked_gather_v4floa
 ; NOGATHER-LABEL: masked_gather_v4float:
 ; NOGATHER:       # %bb.0: # %entry
 ; NOGATHER-NEXT:    vpextrb $0, %xmm1, %eax
-; NOGATHER-NEXT:    # implicit-def: $xmm3
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB5_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
 ; NOGATHER-NEXT:    vmovq %xmm0, %rax
 ; NOGATHER-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
 ; NOGATHER-NEXT:  .LBB5_2: # %else
 ; NOGATHER-NEXT:    vpextrb $4, %xmm1, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB5_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
 ; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rax
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; NOGATHER-NEXT:  .LBB5_4: # %else2
 ; NOGATHER-NEXT:    vpextrb $8, %xmm1, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB5_6
 ; NOGATHER-NEXT:  # %bb.5: # %cond.load4
-; NOGATHER-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; NOGATHER-NEXT:    vmovq %xmm4, %rax
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; NOGATHER-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; NOGATHER-NEXT:    vmovq %xmm3, %rax
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; NOGATHER-NEXT:  .LBB5_6: # %else5
 ; NOGATHER-NEXT:    vpextrb $12, %xmm1, %eax
 ; NOGATHER-NEXT:    testb $1, %al
@@ -317,10 +308,9 @@ define <4 x float> @masked_gather_v4floa
 ; NOGATHER-NEXT:  # %bb.7: # %cond.load7
 ; NOGATHER-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rax
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; NOGATHER-NEXT:  .LBB5_8: # %else8
-; NOGATHER-NEXT:    vpslld $31, %xmm1, %xmm0
-; NOGATHER-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; NOGATHER-NEXT:    vmovaps %xmm2, %xmm0
 ; NOGATHER-NEXT:    vzeroupper
 ; NOGATHER-NEXT:    retq
 entry:
@@ -357,86 +347,81 @@ define <8 x i32> @masked_gather_v8i32(<8
 ;
 ; NOGATHER-LABEL: masked_gather_v8i32:
 ; NOGATHER:       # %bb.0: # %entry
-; NOGATHER-NEXT:    vmovdqa (%rdi), %ymm4
-; NOGATHER-NEXT:    vmovdqa 32(%rdi), %ymm3
+; NOGATHER-NEXT:    vmovdqa (%rdi), %ymm3
+; NOGATHER-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    # implicit-def: $ymm2
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB6_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm4, %rax
-; NOGATHER-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vmovq %xmm3, %rax
+; NOGATHER-NEXT:    vpinsrd $0, (%rax), %xmm1, %xmm4
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT:  .LBB6_2: # %else
 ; NOGATHER-NEXT:    vpextrb $2, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB6_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
-; NOGATHER-NEXT:    vpextrq $1, %xmm4, %rax
-; NOGATHER-NEXT:    vpinsrd $1, (%rax), %xmm2, %xmm5
-; NOGATHER-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT:    vpinsrd $1, (%rax), %xmm1, %xmm4
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT:  .LBB6_4: # %else2
 ; NOGATHER-NEXT:    vpextrb $4, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB6_6
 ; NOGATHER-NEXT:  # %bb.5: # %cond.load4
-; NOGATHER-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; NOGATHER-NEXT:    vmovq %xmm5, %rax
-; NOGATHER-NEXT:    vpinsrd $2, (%rax), %xmm2, %xmm5
-; NOGATHER-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; NOGATHER-NEXT:    vmovq %xmm4, %rax
+; NOGATHER-NEXT:    vpinsrd $2, (%rax), %xmm1, %xmm4
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT:  .LBB6_6: # %else5
 ; NOGATHER-NEXT:    vpextrb $6, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB6_8
 ; NOGATHER-NEXT:  # %bb.7: # %cond.load7
-; NOGATHER-NEXT:    vextractf128 $1, %ymm4, %xmm4
-; NOGATHER-NEXT:    vpextrq $1, %xmm4, %rax
-; NOGATHER-NEXT:    vpinsrd $3, (%rax), %xmm2, %xmm4
-; NOGATHER-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT:    vpinsrd $3, (%rax), %xmm1, %xmm3
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT:  .LBB6_8: # %else8
 ; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB6_10
 ; NOGATHER-NEXT:  # %bb.9: # %cond.load10
-; NOGATHER-NEXT:    vmovq %xmm3, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; NOGATHER-NEXT:    vpinsrd $0, (%rax), %xmm4, %xmm4
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT:    vmovq %xmm2, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; NOGATHER-NEXT:    vpinsrd $0, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB6_10: # %else11
 ; NOGATHER-NEXT:    vpextrb $10, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB6_12
 ; NOGATHER-NEXT:  # %bb.11: # %cond.load13
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; NOGATHER-NEXT:    vpinsrd $1, (%rax), %xmm4, %xmm4
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; NOGATHER-NEXT:    vpinsrd $1, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB6_12: # %else14
 ; NOGATHER-NEXT:    vpextrb $12, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB6_14
 ; NOGATHER-NEXT:  # %bb.13: # %cond.load16
-; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; NOGATHER-NEXT:    vmovq %xmm4, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; NOGATHER-NEXT:    vpinsrd $2, (%rax), %xmm4, %xmm4
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT:    vmovq %xmm3, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; NOGATHER-NEXT:    vpinsrd $2, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB6_14: # %else17
 ; NOGATHER-NEXT:    vpextrb $14, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB6_16
 ; NOGATHER-NEXT:  # %bb.15: # %cond.load19
-; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; NOGATHER-NEXT:    vpinsrd $3, (%rax), %xmm3, %xmm3
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; NOGATHER-NEXT:    vpinsrd $3, (%rax), %xmm0, %xmm0
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB6_16: # %else20
-; NOGATHER-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; NOGATHER-NEXT:    vpslld $31, %xmm3, %xmm3
-; NOGATHER-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; NOGATHER-NEXT:    vpslld $31, %xmm0, %xmm0
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; NOGATHER-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT:    vmovaps %ymm1, %ymm0
 ; NOGATHER-NEXT:    retq
 entry:
   %ld  = load <8 x i32*>, <8 x i32*>* %ptr
@@ -473,87 +458,82 @@ define <8 x float> @masked_gather_v8floa
 ;
 ; NOGATHER-LABEL: masked_gather_v8float:
 ; NOGATHER:       # %bb.0: # %entry
-; NOGATHER-NEXT:    vmovdqa (%rdi), %ymm4
-; NOGATHER-NEXT:    vmovdqa 32(%rdi), %ymm3
+; NOGATHER-NEXT:    vmovdqa (%rdi), %ymm3
+; NOGATHER-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    # implicit-def: $ymm2
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB7_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm4, %rax
-; NOGATHER-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vmovq %xmm3, %rax
+; NOGATHER-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7]
 ; NOGATHER-NEXT:  .LBB7_2: # %else
 ; NOGATHER-NEXT:    vpextrb $2, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB7_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
-; NOGATHER-NEXT:    vpextrq $1, %xmm4, %rax
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm5 = xmm2[0],mem[0],xmm2[2,3]
-; NOGATHER-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm4 = xmm1[0],mem[0],xmm1[2,3]
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT:  .LBB7_4: # %else2
 ; NOGATHER-NEXT:    vpextrb $4, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB7_6
 ; NOGATHER-NEXT:  # %bb.5: # %cond.load4
-; NOGATHER-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; NOGATHER-NEXT:    vmovq %xmm5, %rax
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm5 = xmm2[0,1],mem[0],xmm2[3]
-; NOGATHER-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; NOGATHER-NEXT:    vmovq %xmm4, %rax
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm4 = xmm1[0,1],mem[0],xmm1[3]
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT:  .LBB7_6: # %else5
 ; NOGATHER-NEXT:    vpextrb $6, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB7_8
 ; NOGATHER-NEXT:  # %bb.7: # %cond.load7
-; NOGATHER-NEXT:    vextractf128 $1, %ymm4, %xmm4
-; NOGATHER-NEXT:    vpextrq $1, %xmm4, %rax
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
-; NOGATHER-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],mem[0]
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT:  .LBB7_8: # %else8
 ; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB7_10
 ; NOGATHER-NEXT:  # %bb.9: # %cond.load10
-; NOGATHER-NEXT:    vmovq %xmm3, %rax
-; NOGATHER-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; NOGATHER-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT:    vmovq %xmm2, %rax
+; NOGATHER-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; NOGATHER-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3]
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB7_10: # %else11
 ; NOGATHER-NEXT:    vpextrb $10, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB7_12
 ; NOGATHER-NEXT:  # %bb.11: # %cond.load13
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB7_12: # %else14
 ; NOGATHER-NEXT:    vpextrb $12, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB7_14
 ; NOGATHER-NEXT:  # %bb.13: # %cond.load16
-; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; NOGATHER-NEXT:    vmovq %xmm4, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT:    vmovq %xmm3, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB7_14: # %else17
 ; NOGATHER-NEXT:    vpextrb $14, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB7_16
 ; NOGATHER-NEXT:  # %bb.15: # %cond.load19
-; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB7_16: # %else20
-; NOGATHER-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; NOGATHER-NEXT:    vpslld $31, %xmm3, %xmm3
-; NOGATHER-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; NOGATHER-NEXT:    vpslld $31, %xmm0, %xmm0
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; NOGATHER-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT:    vmovaps %ymm1, %ymm0
 ; NOGATHER-NEXT:    retq
 entry:
   %ld  = load <8 x float*>, <8 x float*>* %ptr
@@ -585,50 +565,44 @@ define <4 x i64> @masked_gather_v4i64(<4
 ;
 ; NOGATHER-LABEL: masked_gather_v4i64:
 ; NOGATHER:       # %bb.0: # %entry
-; NOGATHER-NEXT:    vmovdqa (%rdi), %ymm3
+; NOGATHER-NEXT:    vmovdqa (%rdi), %ymm2
 ; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    # implicit-def: $ymm2
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB8_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm3, %rax
-; NOGATHER-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT:    vmovq %xmm2, %rax
+; NOGATHER-NEXT:    vpinsrq $0, (%rax), %xmm1, %xmm3
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT:  .LBB8_2: # %else
 ; NOGATHER-NEXT:    vpextrb $4, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB8_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vpinsrq $1, (%rax), %xmm2, %xmm4
-; NOGATHER-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
+; NOGATHER-NEXT:    vpinsrq $1, (%rax), %xmm1, %xmm3
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT:  .LBB8_4: # %else2
 ; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB8_6
 ; NOGATHER-NEXT:  # %bb.5: # %cond.load4
-; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; NOGATHER-NEXT:    vmovq %xmm4, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; NOGATHER-NEXT:    vpinsrq $0, (%rax), %xmm4, %xmm4
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT:    vmovq %xmm3, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; NOGATHER-NEXT:    vpinsrq $0, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB8_6: # %else5
 ; NOGATHER-NEXT:    vpextrb $12, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB8_8
 ; NOGATHER-NEXT:  # %bb.7: # %cond.load7
-; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; NOGATHER-NEXT:    vpinsrq $1, (%rax), %xmm3, %xmm3
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; NOGATHER-NEXT:    vpinsrq $1, (%rax), %xmm0, %xmm0
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB8_8: # %else8
-; NOGATHER-NEXT:    vpslld $31, %xmm0, %xmm0
-; NOGATHER-NEXT:    vpsrad $31, %xmm0, %xmm0
-; NOGATHER-NEXT:    vpmovsxdq %xmm0, %xmm3
-; NOGATHER-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; NOGATHER-NEXT:    vpmovsxdq %xmm0, %xmm0
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; NOGATHER-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT:    vmovaps %ymm1, %ymm0
 ; NOGATHER-NEXT:    retq
 entry:
   %ld  = load <4 x i64*>, <4 x i64*>* %ptr
@@ -660,50 +634,44 @@ define <4 x double> @masked_gather_v4dou
 ;
 ; NOGATHER-LABEL: masked_gather_v4double:
 ; NOGATHER:       # %bb.0: # %entry
-; NOGATHER-NEXT:    vmovdqa (%rdi), %ymm3
+; NOGATHER-NEXT:    vmovdqa (%rdi), %ymm2
 ; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    # implicit-def: $ymm2
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB9_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm3, %rax
-; NOGATHER-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT:    vmovq %xmm2, %rax
+; NOGATHER-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; NOGATHER-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3]
 ; NOGATHER-NEXT:  .LBB9_2: # %else
 ; NOGATHER-NEXT:    vpextrb $4, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB9_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
-; NOGATHER-NEXT:    vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
+; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
+; NOGATHER-NEXT:    vmovhpd {{.*#+}} xmm3 = xmm1[0],mem[0]
+; NOGATHER-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
 ; NOGATHER-NEXT:  .LBB9_4: # %else2
 ; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB9_6
 ; NOGATHER-NEXT:  # %bb.5: # %cond.load4
-; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; NOGATHER-NEXT:    vmovq %xmm4, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; NOGATHER-NEXT:    vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT:    vmovq %xmm3, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; NOGATHER-NEXT:    vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB9_6: # %else5
 ; NOGATHER-NEXT:    vpextrb $12, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB9_8
 ; NOGATHER-NEXT:  # %bb.7: # %cond.load7
-; NOGATHER-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; NOGATHER-NEXT:    vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; NOGATHER-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; NOGATHER-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; NOGATHER-NEXT:  .LBB9_8: # %else8
-; NOGATHER-NEXT:    vpslld $31, %xmm0, %xmm0
-; NOGATHER-NEXT:    vpsrad $31, %xmm0, %xmm0
-; NOGATHER-NEXT:    vpmovsxdq %xmm0, %xmm3
-; NOGATHER-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; NOGATHER-NEXT:    vpmovsxdq %xmm0, %xmm0
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; NOGATHER-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT:    vmovapd %ymm1, %ymm0
 ; NOGATHER-NEXT:    retq
 entry:
   %ld  = load <4 x double*>, <4 x double*>* %ptr
@@ -733,24 +701,22 @@ define <2 x i64> @masked_gather_v2i64(<2
 ;
 ; NOGATHER-LABEL: masked_gather_v2i64:
 ; NOGATHER:       # %bb.0: # %entry
-; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm2
 ; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    # implicit-def: $xmm2
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB10_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm3, %rax
-; NOGATHER-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT:    vmovq %xmm2, %rax
+; NOGATHER-NEXT:    vpinsrq $0, (%rax), %xmm1, %xmm1
 ; NOGATHER-NEXT:  .LBB10_2: # %else
 ; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB10_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vpinsrq $1, (%rax), %xmm2, %xmm2
+; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
+; NOGATHER-NEXT:    vpinsrq $1, (%rax), %xmm1, %xmm1
 ; NOGATHER-NEXT:  .LBB10_4: # %else2
-; NOGATHER-NEXT:    vpsllq $63, %xmm0, %xmm0
-; NOGATHER-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT:    vmovdqa %xmm1, %xmm0
 ; NOGATHER-NEXT:    retq
 entry:
   %ld  = load <2 x i64*>, <2 x i64*>* %ptr
@@ -780,24 +746,22 @@ define <2 x double> @masked_gather_v2dou
 ;
 ; NOGATHER-LABEL: masked_gather_v2double:
 ; NOGATHER:       # %bb.0: # %entry
-; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm2
 ; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    # implicit-def: $xmm2
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB11_2
 ; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm3, %rax
-; NOGATHER-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT:    vmovq %xmm2, %rax
+; NOGATHER-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
 ; NOGATHER-NEXT:  .LBB11_2: # %else
 ; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
 ; NOGATHER-NEXT:    je .LBB11_4
 ; NOGATHER-NEXT:  # %bb.3: # %cond.load1
-; NOGATHER-NEXT:    vpextrq $1, %xmm3, %rax
-; NOGATHER-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
+; NOGATHER-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; NOGATHER-NEXT:  .LBB11_4: # %else2
-; NOGATHER-NEXT:    vpsllq $63, %xmm0, %xmm0
-; NOGATHER-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT:    vmovapd %xmm1, %xmm0
 ; NOGATHER-NEXT:    retq
 entry:
   %ld  = load <2 x double*>, <2 x double*>* %ptr

Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=343273&r1=343272&r2=343273&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Thu Sep 27 14:28:59 2018
@@ -1658,38 +1658,35 @@ declare <3 x i32> @llvm.masked.gather.v3
 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
 ; KNL_64-LABEL: test30:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    # kill: def $xmm3 killed $xmm3 def $zmm3
 ; KNL_64-NEXT:    vpslld $31, %xmm2, %xmm2
-; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; KNL_64-NEXT:    kmovw %k1, %eax
+; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL_64-NEXT:    kmovw %k0, %eax
 ; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; KNL_64-NEXT:    vpsllq $2, %ymm1, %ymm1
-; KNL_64-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; KNL_64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; KNL_64-NEXT:    testb $1, %al
-; KNL_64-NEXT:    # implicit-def: $xmm0
 ; KNL_64-NEXT:    je .LBB31_2
 ; KNL_64-NEXT:  # %bb.1: # %cond.load
-; KNL_64-NEXT:    vmovq %xmm1, %rax
-; KNL_64-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL_64-NEXT:    vmovq %xmm0, %rax
+; KNL_64-NEXT:    vpinsrd $0, (%rax), %xmm3, %xmm3
 ; KNL_64-NEXT:  .LBB31_2: # %else
-; KNL_64-NEXT:    kshiftrw $1, %k1, %k0
-; KNL_64-NEXT:    kmovw %k0, %eax
+; KNL_64-NEXT:    kshiftrw $1, %k0, %k1
+; KNL_64-NEXT:    kmovw %k1, %eax
 ; KNL_64-NEXT:    testb $1, %al
 ; KNL_64-NEXT:    je .LBB31_4
 ; KNL_64-NEXT:  # %bb.3: # %cond.load1
-; KNL_64-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_64-NEXT:    vpinsrd $1, (%rax), %xmm0, %xmm0
+; KNL_64-NEXT:    vpextrq $1, %xmm0, %rax
+; KNL_64-NEXT:    vpinsrd $1, (%rax), %xmm3, %xmm3
 ; KNL_64-NEXT:  .LBB31_4: # %else2
-; KNL_64-NEXT:    kshiftrw $2, %k1, %k0
+; KNL_64-NEXT:    kshiftrw $2, %k0, %k0
 ; KNL_64-NEXT:    kmovw %k0, %eax
 ; KNL_64-NEXT:    testb $1, %al
 ; KNL_64-NEXT:    je .LBB31_6
 ; KNL_64-NEXT:  # %bb.5: # %cond.load4
-; KNL_64-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; KNL_64-NEXT:    vmovq %xmm1, %rax
-; KNL_64-NEXT:    vpinsrd $2, (%rax), %xmm0, %xmm0
+; KNL_64-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL_64-NEXT:    vmovq %xmm0, %rax
+; KNL_64-NEXT:    vpinsrd $2, (%rax), %xmm3, %xmm3
 ; KNL_64-NEXT:  .LBB31_6: # %else5
-; KNL_64-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
 ; KNL_64-NEXT:    vmovdqa %xmm3, %xmm0
 ; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
@@ -1698,37 +1695,35 @@ define <3 x i32> @test30(<3 x i32*> %bas
 ; KNL_32:       # %bb.0:
 ; KNL_32-NEXT:    subl $12, %esp
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 16
-; KNL_32-NEXT:    vpslld $31, %xmm2, %xmm2
-; KNL_32-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; KNL_32-NEXT:    kmovw %k1, %eax
+; KNL_32-NEXT:    vmovdqa %xmm0, %xmm3
+; KNL_32-NEXT:    vpslld $31, %xmm2, %xmm0
+; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL_32-NEXT:    kmovw %k0, %eax
+; KNL_32-NEXT:    vmovdqa {{[0-9]+}}(%esp), %xmm0
 ; KNL_32-NEXT:    vpslld $2, %xmm1, %xmm1
-; KNL_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; KNL_32-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; KNL_32-NEXT:    testb $1, %al
-; KNL_32-NEXT:    # implicit-def: $xmm1
 ; KNL_32-NEXT:    je .LBB31_2
 ; KNL_32-NEXT:  # %bb.1: # %cond.load
-; KNL_32-NEXT:    vmovd %xmm2, %eax
-; KNL_32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; KNL_32-NEXT:    vmovd %xmm1, %eax
+; KNL_32-NEXT:    vpinsrd $0, (%eax), %xmm0, %xmm0
 ; KNL_32-NEXT:  .LBB31_2: # %else
-; KNL_32-NEXT:    kshiftrw $1, %k1, %k0
-; KNL_32-NEXT:    kmovw %k0, %eax
+; KNL_32-NEXT:    kshiftrw $1, %k0, %k1
+; KNL_32-NEXT:    kmovw %k1, %eax
 ; KNL_32-NEXT:    testb $1, %al
 ; KNL_32-NEXT:    je .LBB31_4
 ; KNL_32-NEXT:  # %bb.3: # %cond.load1
-; KNL_32-NEXT:    vpextrd $1, %xmm2, %eax
-; KNL_32-NEXT:    vpinsrd $1, (%eax), %xmm1, %xmm1
+; KNL_32-NEXT:    vpextrd $1, %xmm1, %eax
+; KNL_32-NEXT:    vpinsrd $1, (%eax), %xmm0, %xmm0
 ; KNL_32-NEXT:  .LBB31_4: # %else2
-; KNL_32-NEXT:    vmovdqa {{[0-9]+}}(%esp), %xmm0
-; KNL_32-NEXT:    kshiftrw $2, %k1, %k0
+; KNL_32-NEXT:    kshiftrw $2, %k0, %k0
 ; KNL_32-NEXT:    kmovw %k0, %eax
 ; KNL_32-NEXT:    testb $1, %al
 ; KNL_32-NEXT:    je .LBB31_6
 ; KNL_32-NEXT:  # %bb.5: # %cond.load4
-; KNL_32-NEXT:    vpextrd $2, %xmm2, %eax
-; KNL_32-NEXT:    vpinsrd $2, (%eax), %xmm1, %xmm1
+; KNL_32-NEXT:    vpextrd $2, %xmm1, %eax
+; KNL_32-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
 ; KNL_32-NEXT:  .LBB31_6: # %else5
-; KNL_32-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; KNL_32-NEXT:    addl $12, %esp
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 4
 ; KNL_32-NEXT:    vzeroupper
@@ -1737,36 +1732,34 @@ define <3 x i32> @test30(<3 x i32*> %bas
 ; SKX-LABEL: test30:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpslld $31, %xmm2, %xmm2
-; SKX-NEXT:    vpmovd2m %xmm2, %k1
-; SKX-NEXT:    kmovw %k1, %eax
+; SKX-NEXT:    vpmovd2m %xmm2, %k0
+; SKX-NEXT:    kmovw %k0, %eax
 ; SKX-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; SKX-NEXT:    vpsllq $2, %ymm1, %ymm1
-; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; SKX-NEXT:    testb $1, %al
-; SKX-NEXT:    # implicit-def: $xmm0
 ; SKX-NEXT:    je .LBB31_2
 ; SKX-NEXT:  # %bb.1: # %cond.load
-; SKX-NEXT:    vmovq %xmm1, %rax
-; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    vmovq %xmm0, %rax
+; SKX-NEXT:    vpinsrd $0, (%rax), %xmm3, %xmm3
 ; SKX-NEXT:  .LBB31_2: # %else
-; SKX-NEXT:    kshiftrw $1, %k1, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kshiftrw $1, %k0, %k1
+; SKX-NEXT:    kmovw %k1, %eax
 ; SKX-NEXT:    testb $1, %al
 ; SKX-NEXT:    je .LBB31_4
 ; SKX-NEXT:  # %bb.3: # %cond.load1
-; SKX-NEXT:    vpextrq $1, %xmm1, %rax
-; SKX-NEXT:    vpinsrd $1, (%rax), %xmm0, %xmm0
+; SKX-NEXT:    vpextrq $1, %xmm0, %rax
+; SKX-NEXT:    vpinsrd $1, (%rax), %xmm3, %xmm3
 ; SKX-NEXT:  .LBB31_4: # %else2
-; SKX-NEXT:    kshiftrw $2, %k1, %k0
+; SKX-NEXT:    kshiftrw $2, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
 ; SKX-NEXT:    testb $1, %al
 ; SKX-NEXT:    je .LBB31_6
 ; SKX-NEXT:  # %bb.5: # %cond.load4
-; SKX-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; SKX-NEXT:    vmovq %xmm1, %rax
-; SKX-NEXT:    vpinsrd $2, (%rax), %xmm0, %xmm0
+; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT:    vmovq %xmm0, %rax
+; SKX-NEXT:    vpinsrd $2, (%rax), %xmm3, %xmm3
 ; SKX-NEXT:  .LBB31_6: # %else5
-; SKX-NEXT:    vmovdqa32 %xmm0, %xmm3 {%k1}
 ; SKX-NEXT:    vmovdqa %xmm3, %xmm0
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -1775,36 +1768,35 @@ define <3 x i32> @test30(<3 x i32*> %bas
 ; SKX_32:       # %bb.0:
 ; SKX_32-NEXT:    subl $12, %esp
 ; SKX_32-NEXT:    .cfi_def_cfa_offset 16
-; SKX_32-NEXT:    vpslld $31, %xmm2, %xmm2
-; SKX_32-NEXT:    vpmovd2m %xmm2, %k1
-; SKX_32-NEXT:    kmovw %k1, %eax
+; SKX_32-NEXT:    vmovdqa %xmm0, %xmm3
+; SKX_32-NEXT:    vpslld $31, %xmm2, %xmm0
+; SKX_32-NEXT:    vpmovd2m %xmm0, %k0
+; SKX_32-NEXT:    kmovw %k0, %eax
+; SKX_32-NEXT:    vmovdqa {{[0-9]+}}(%esp), %xmm0
 ; SKX_32-NEXT:    vpslld $2, %xmm1, %xmm1
-; SKX_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; SKX_32-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; SKX_32-NEXT:    testb $1, %al
-; SKX_32-NEXT:    # implicit-def: $xmm1
 ; SKX_32-NEXT:    je .LBB31_2
 ; SKX_32-NEXT:  # %bb.1: # %cond.load
-; SKX_32-NEXT:    vmovd %xmm2, %eax
-; SKX_32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SKX_32-NEXT:    vmovd %xmm1, %eax
+; SKX_32-NEXT:    vpinsrd $0, (%eax), %xmm0, %xmm0
 ; SKX_32-NEXT:  .LBB31_2: # %else
-; SKX_32-NEXT:    kshiftrw $1, %k1, %k0
-; SKX_32-NEXT:    kmovw %k0, %eax
+; SKX_32-NEXT:    kshiftrw $1, %k0, %k1
+; SKX_32-NEXT:    kmovw %k1, %eax
 ; SKX_32-NEXT:    testb $1, %al
 ; SKX_32-NEXT:    je .LBB31_4
 ; SKX_32-NEXT:  # %bb.3: # %cond.load1
-; SKX_32-NEXT:    vpextrd $1, %xmm2, %eax
-; SKX_32-NEXT:    vpinsrd $1, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT:    vpextrd $1, %xmm1, %eax
+; SKX_32-NEXT:    vpinsrd $1, (%eax), %xmm0, %xmm0
 ; SKX_32-NEXT:  .LBB31_4: # %else2
-; SKX_32-NEXT:    vmovdqa {{[0-9]+}}(%esp), %xmm0
-; SKX_32-NEXT:    kshiftrw $2, %k1, %k0
+; SKX_32-NEXT:    kshiftrw $2, %k0, %k0
 ; SKX_32-NEXT:    kmovw %k0, %eax
 ; SKX_32-NEXT:    testb $1, %al
 ; SKX_32-NEXT:    je .LBB31_6
 ; SKX_32-NEXT:  # %bb.5: # %cond.load4
-; SKX_32-NEXT:    vpextrd $2, %xmm2, %eax
-; SKX_32-NEXT:    vpinsrd $2, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT:    vpextrd $2, %xmm1, %eax
+; SKX_32-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
 ; SKX_32-NEXT:  .LBB31_6: # %else5
-; SKX_32-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; SKX_32-NEXT:    addl $12, %esp
 ; SKX_32-NEXT:    .cfi_def_cfa_offset 4
 ; SKX_32-NEXT:    retl

Modified: llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-gather.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-gather.ll?rev=343273&r1=343272&r2=343273&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-gather.ll (original)
+++ llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-gather.ll Thu Sep 27 14:28:59 2018
@@ -8,10 +8,10 @@ define <2 x i64> @scalarize_v2i64(<2 x i
 ; CHECK:       cond.load:
 ; CHECK-NEXT:    [[PTR0:%.*]] = extractelement <2 x i64*> [[P:%.*]], i32 0
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[PTR0]], align 8
-; CHECK-NEXT:    [[RES0:%.*]] = insertelement <2 x i64> undef, i64 [[LOAD0]], i32 0
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[LOAD0]], i32 0
 ; CHECK-NEXT:    br label [[ELSE]]
 ; CHECK:       else:
-; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[RES0]], [[COND_LOAD]] ], [ undef, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[RES0]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
 ; CHECK-NEXT:    [[MASK1:%.*]] = extractelement <2 x i1> [[MASK]], i32 1
 ; CHECK-NEXT:    br i1 [[MASK1]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
 ; CHECK:       cond.load1:
@@ -20,9 +20,8 @@ define <2 x i64> @scalarize_v2i64(<2 x i
 ; CHECK-NEXT:    [[RES1:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[LOAD1]], i32 1
 ; CHECK-NEXT:    br label [[ELSE2]]
 ; CHECK:       else2:
-; CHECK-NEXT:    [[RES_PHI_SELECT:%.*]] = phi <2 x i64> [ [[RES1]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[MASK]], <2 x i64> [[RES_PHI_SELECT]], <2 x i64> [[PASSTHRU:%.*]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[RES1]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
 ;
   %ret = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %p, i32 8, <2 x i1> %mask, <2 x i64> %passthru)
   ret <2 x i64> %ret
@@ -32,12 +31,11 @@ define <2 x i64> @scalarize_v2i64_ones_m
 ; CHECK-LABEL: @scalarize_v2i64_ones_mask(
 ; CHECK-NEXT:    [[PTR0:%.*]] = extractelement <2 x i64*> [[P:%.*]], i32 0
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[PTR0]], align 8
-; CHECK-NEXT:    [[RES0:%.*]] = insertelement <2 x i64> undef, i64 [[LOAD0]], i32 0
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[LOAD0]], i32 0
 ; CHECK-NEXT:    [[PTR1:%.*]] = extractelement <2 x i64*> [[P]], i32 1
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[PTR1]], align 8
 ; CHECK-NEXT:    [[RES1:%.*]] = insertelement <2 x i64> [[RES0]], i64 [[LOAD1]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> <i1 true, i1 true>, <2 x i64> [[RES1]], <2 x i64> [[PASSTHRU:%.*]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> [[RES1]]
 ;
   %ret = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %p, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
   ret <2 x i64> %ret
@@ -45,8 +43,7 @@ define <2 x i64> @scalarize_v2i64_ones_m
 
 define <2 x i64> @scalarize_v2i64_zero_mask(<2 x i64*> %p, <2 x i64> %passthru) {
 ; CHECK-LABEL: @scalarize_v2i64_zero_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> zeroinitializer, <2 x i64> undef, <2 x i64> [[PASSTHRU:%.*]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> [[PASSTHRU:%.*]]
 ;
   %ret = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %p, i32 8, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
   ret <2 x i64> %ret
@@ -56,9 +53,8 @@ define <2 x i64> @scalarize_v2i64_const_
 ; CHECK-LABEL: @scalarize_v2i64_const_mask(
 ; CHECK-NEXT:    [[PTR1:%.*]] = extractelement <2 x i64*> [[P:%.*]], i32 1
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[PTR1]], align 8
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <2 x i64> undef, i64 [[LOAD1]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> <i1 false, i1 true>, <2 x i64> [[RES1]], <2 x i64> [[PASSTHRU:%.*]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[LOAD1]], i32 1
+; CHECK-NEXT:    ret <2 x i64> [[RES1]]
 ;
   %ret = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %p, i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
   ret <2 x i64> %ret




More information about the llvm-commits mailing list