[llvm] 75d36dc - [X86][SelectionDAG] Fix the Gather's base and index by modifying the Scale value (#137813)

via llvm-commits llvm-commits at lists.llvm.org
Tue May 13 07:36:52 PDT 2025


Author: Rohit Aggarwal
Date: 2025-05-13T15:36:48+01:00
New Revision: 75d36dc5a77ba3759eef15aeca925a1400504c89

URL: https://github.com/llvm/llvm-project/commit/75d36dc5a77ba3759eef15aeca925a1400504c89
DIFF: https://github.com/llvm/llvm-project/commit/75d36dc5a77ba3759eef15aeca925a1400504c89.diff

LOG: [X86][SelectionDAG] Fix the Gather's base and index by modifying the Scale value (#137813)

Fix the gather's base and index for one or multiple uses of the index node. The approach is to update the Scale value when the index is an SHL opcode followed by a truncate; a standalone sketch of the arithmetic follows.
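
For readers skimming the diff, the transform rests on a simple identity: an x86 gather computes Base + Index * Scale, with Scale restricted to 1, 2, 4, or 8, so when Index = X << ShAmt one shift bit can migrate from the index into the scale. Below is a minimal standalone C++ sketch of that bookkeeping, not the LLVM code itself; foldShiftIntoScale is a hypothetical helper name used purely for illustration.

// Identity the patch relies on (valid for ShAmt >= 1):
//   Base + (X << ShAmt) * Scale == Base + (X << (ShAmt - 1)) * (Scale * 2)
#include <cassert>
#include <utility>

// Hypothetical helper: fold one shift bit from the index into the scale.
// Returns {NewShAmt, NewScale}; the folded scale must stay a legal x86
// scale (a power of 2 no larger than 8).
std::pair<unsigned, unsigned> foldShiftIntoScale(unsigned ShAmt,
                                                 unsigned Scale) {
  assert(ShAmt >= 1 && "need at least one shift bit to fold");
  assert((Scale & (Scale - 1)) == 0 && "scale must be a power of 2");
  assert(Scale * 2 <= 8 && "folded scale must stay a legal x86 scale");
  return {ShAmt - 1, Scale * 2};
}

int main() {
  // Index = X << 2 at Scale 2 becomes Index = X << 1 at Scale 4;
  // both address Base + X * 8.
  auto [NewShAmt, NewScale] = foldShiftIntoScale(/*ShAmt=*/2, /*Scale=*/2);
  assert(NewShAmt == 1 && NewScale == 4);
  return 0;
}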

---------

Co-authored-by: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Co-authored-by: Simon Pilgrim <llvm-dev at redking.me.uk>

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/masked_gather_scatter.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e3bb5db07ac40..26da58a140331 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56712,6 +56712,34 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
 
   if (DCI.isBeforeLegalize()) {
     unsigned IndexWidth = Index.getScalarValueSizeInBits();
+    // Attempt to move shifted index into the address scale, allows further
+    // index truncation below.
+    if (Index.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Scale)) {
+      unsigned ScaleAmt = Scale->getAsZExtVal();
+      assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
+      unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
+      unsigned MaskBits = IndexWidth - Log2ScaleAmt;
+      APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
+      if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
+        if (N->getOpcode() != ISD::DELETED_NODE)
+          DCI.AddToWorklist(N);
+        return SDValue(N, 0);
+      }
+      if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
+        if (*MinShAmt >= 1 && (*MinShAmt + Log2ScaleAmt) < 4 &&
+            DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
+          SDValue ShAmt = Index.getOperand(1);
+          SDValue NewShAmt =
+              DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
+                          DAG.getConstant(1, DL, ShAmt.getValueType()));
+          SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
+                                         Index.getOperand(0), NewShAmt);
+          SDValue NewScale =
+              DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
+          return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
+        }
+      }
+    }
 
     // Shrink indices if they are larger than 32-bits.
     // Only do this before legalize types since v2i64 could become v2i32.
@@ -56722,8 +56750,8 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
 
       // FIXME: We could support more than just constant fold, but we need to
       // careful with costing. A truncate that can be optimized out would be
-      // fine. Otherwise we might only want to create a truncate if it avoids a
-      // split.
+      // fine. Otherwise we might only want to create a truncate if it avoids
+      // a split.
       if (SDValue TruncIndex =
               DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
         return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
@@ -56737,6 +56765,12 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
         Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
         return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
       }
+
+      // Shrink if we remove an illegal type.
+      if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
+        Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
+        return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+      }
     }
   }
 

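A note on what the test updates below demonstrate: with AVX-512, vgatherqps takes an 8 x i64 index in a zmm register but can only produce an 8 x f32 ymm result, so a 16-element gather previously needed two gather instructions plus a vinsertf64x4 to recombine the halves. Once the index is narrowed to 32 bits, a single vgatherdps with a zmm index covers all 16 lanes, and the vpmovzxdq/vpmovsxdq index extensions disappear in the 8-element cases as well.
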
diff  --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 8b8ae7ff9b088..a5c727e8df9d6 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -604,17 +604,14 @@ define <16 x float> @test13(ptr %base, <16 x i32> %ind) {
 
; The base pointer is not splat, can't find uniform base
 define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
-; X64-KNL-LABEL: test14:
-; X64-KNL:       # %bb.0:
-; X64-KNL-NEXT:    vmovq %xmm0, %rax
-; X64-KNL-NEXT:    vmovd %esi, %xmm0
-; X64-KNL-NEXT:    vpbroadcastd %xmm0, %ymm0
-; X64-KNL-NEXT:    vpmovsxdq %ymm0, %zmm0
-; X64-KNL-NEXT:    kxnorw %k0, %k0, %k1
-; X64-KNL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-KNL-NEXT:    vgatherqps (%rax,%zmm0,4), %ymm1 {%k1}
-; X64-KNL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
-; X64-KNL-NEXT:    retq
+; X64-LABEL: test14:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovq %xmm0, %rax
+; X64-NEXT:    vpbroadcastd %esi, %zmm1
+; X64-NEXT:    kxnorw %k0, %k0, %k1
+; X64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X64-NEXT:    vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
+; X64-NEXT:    retq
 ;
 ; X86-LABEL: test14:
 ; X86:       # %bb.0:
@@ -624,17 +621,6 @@ define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
 ; X86-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; X86-NEXT:    retl
-;
-; X64-SKX-LABEL: test14:
-; X64-SKX:       # %bb.0:
-; X64-SKX-NEXT:    vmovq %xmm0, %rax
-; X64-SKX-NEXT:    vpbroadcastd %esi, %ymm0
-; X64-SKX-NEXT:    vpmovsxdq %ymm0, %zmm0
-; X64-SKX-NEXT:    kxnorw %k0, %k0, %k1
-; X64-SKX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-SKX-NEXT:    vgatherqps (%rax,%zmm0,4), %ymm1 {%k1}
-; X64-SKX-NEXT:    vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
-; X64-SKX-NEXT:    retq
   %broadcast.splatinsert = insertelement <16 x ptr> %vec, ptr %base, i32 1
   %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 
@@ -5116,13 +5102,13 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 define <8 x float> @test_gather_structpt2_8f32_mask_index(ptr %x, ptr %arr, <8 x i1> %mask, <8 x float> %src0) {
 ; X64-KNL-LABEL: test_gather_structpt2_8f32_mask_index:
 ; X64-KNL:       # %bb.0:
+; X64-KNL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X64-KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; X64-KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; X64-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; X64-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
-; X64-KNL-NEXT:    vpand (%rsi), %ymm0, %ymm0
-; X64-KNL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT:    vgatherqps (%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-KNL-NEXT:    vbroadcastss {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
+; X64-KNL-NEXT:    vandps (%rsi), %ymm0, %ymm0
+; X64-KNL-NEXT:    vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-KNL-NEXT:    vmovaps %ymm1, %ymm0
 ; X64-KNL-NEXT:    retq
 ;
@@ -5145,10 +5131,9 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index(ptr %x, ptr %arr, <8 x
 ; X64-SKX-SMALL-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X64-SKX-SMALL-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X64-SKX-SMALL-NEXT:    vpmovd2m %ymm0, %k1
-; X64-SKX-SMALL-NEXT:    vmovdqu (%rsi), %ymm0
-; X64-SKX-SMALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; X64-SKX-SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT:    vgatherqps (%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-SKX-SMALL-NEXT:    vmovups (%rsi), %ymm0
+; X64-SKX-SMALL-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; X64-SKX-SMALL-NEXT:    vgatherdps (%rdi,%ymm0,8), %ymm1 {%k1}
 ; X64-SKX-SMALL-NEXT:    vmovaps %ymm1, %ymm0
 ; X64-SKX-SMALL-NEXT:    retq
 ;
@@ -5157,11 +5142,10 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index(ptr %x, ptr %arr, <8 x
 ; X64-SKX-LARGE-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X64-SKX-LARGE-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X64-SKX-LARGE-NEXT:    vpmovd2m %ymm0, %k1
-; X64-SKX-LARGE-NEXT:    vmovdqu (%rsi), %ymm0
+; X64-SKX-LARGE-NEXT:    vmovups (%rsi), %ymm0
 ; X64-SKX-LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT:    vpandd (%rax){1to8}, %ymm0, %ymm0
-; X64-SKX-LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT:    vgatherqps (%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-SKX-LARGE-NEXT:    vandps (%rax){1to8}, %ymm0, %ymm0
+; X64-SKX-LARGE-NEXT:    vgatherdps (%rdi,%ymm0,8), %ymm1 {%k1}
 ; X64-SKX-LARGE-NEXT:    vmovaps %ymm1, %ymm0
 ; X64-SKX-LARGE-NEXT:    retq
 ;
@@ -5187,13 +5171,13 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index(ptr %x, ptr %arr, <8 x
 define <8 x float> @test_gather_structpt2_8f32_mask_index_offset(ptr %x, ptr %arr, <8 x i1> %mask, <8 x float> %src0) {
 ; X64-KNL-LABEL: test_gather_structpt2_8f32_mask_index_offset:
 ; X64-KNL:       # %bb.0:
+; X64-KNL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X64-KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; X64-KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; X64-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; X64-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
-; X64-KNL-NEXT:    vpand (%rsi), %ymm0, %ymm0
-; X64-KNL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT:    vgatherqps 4(%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-KNL-NEXT:    vbroadcastss {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
+; X64-KNL-NEXT:    vandps (%rsi), %ymm0, %ymm0
+; X64-KNL-NEXT:    vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-KNL-NEXT:    vmovaps %ymm1, %ymm0
 ; X64-KNL-NEXT:    retq
 ;
@@ -5216,10 +5200,9 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index_offset(ptr %x, ptr %ar
 ; X64-SKX-SMALL-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X64-SKX-SMALL-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X64-SKX-SMALL-NEXT:    vpmovd2m %ymm0, %k1
-; X64-SKX-SMALL-NEXT:    vmovdqu (%rsi), %ymm0
-; X64-SKX-SMALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; X64-SKX-SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT:    vgatherqps 4(%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-SKX-SMALL-NEXT:    vmovups (%rsi), %ymm0
+; X64-SKX-SMALL-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; X64-SKX-SMALL-NEXT:    vgatherdps 4(%rdi,%ymm0,8), %ymm1 {%k1}
 ; X64-SKX-SMALL-NEXT:    vmovaps %ymm1, %ymm0
 ; X64-SKX-SMALL-NEXT:    retq
 ;
@@ -5228,11 +5211,10 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index_offset(ptr %x, ptr %ar
 ; X64-SKX-LARGE-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X64-SKX-LARGE-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X64-SKX-LARGE-NEXT:    vpmovd2m %ymm0, %k1
-; X64-SKX-LARGE-NEXT:    vmovdqu (%rsi), %ymm0
+; X64-SKX-LARGE-NEXT:    vmovups (%rsi), %ymm0
 ; X64-SKX-LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT:    vpandd (%rax){1to8}, %ymm0, %ymm0
-; X64-SKX-LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT:    vgatherqps 4(%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-SKX-LARGE-NEXT:    vandps (%rax){1to8}, %ymm0, %ymm0
+; X64-SKX-LARGE-NEXT:    vgatherdps 4(%rdi,%ymm0,8), %ymm1 {%k1}
 ; X64-SKX-LARGE-NEXT:    vmovaps %ymm1, %ymm0
 ; X64-SKX-LARGE-NEXT:    retq
 ;
@@ -5258,16 +5240,18 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index_offset(ptr %x, ptr %ar
 define {<8 x float>, <8 x float>} @test_gather_structpt2_8f32_mask_index_pair(ptr %x, ptr %arr, <8 x i1> %mask, <8 x float> %src0) {
 ; X64-KNL-LABEL: test_gather_structpt2_8f32_mask_index_pair:
 ; X64-KNL:       # %bb.0:
+; X64-KNL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X64-KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; X64-KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; X64-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; X64-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
-; X64-KNL-NEXT:    vpand (%rsi), %ymm0, %ymm0
-; X64-KNL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-KNL-NEXT:    vbroadcastss {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
+; X64-KNL-NEXT:    vandps (%rsi), %ymm0, %ymm2
+; X64-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-KNL-NEXT:    kmovw %k1, %k2
-; X64-KNL-NEXT:    vmovaps %ymm1, %ymm0
-; X64-KNL-NEXT:    vgatherqps (%rdi,%zmm2,8), %ymm0 {%k2}
-; X64-KNL-NEXT:    vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
+; X64-KNL-NEXT:    vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-KNL-NEXT:    vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
+; X64-KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-KNL-NEXT:    # kill: def $ymm1 killed $ymm1 killed $zmm1
 ; X64-KNL-NEXT:    retq
 ;
 ; X86-KNL-LABEL: test_gather_structpt2_8f32_mask_index_pair:
@@ -5293,13 +5277,12 @@ define {<8 x float>, <8 x float>} @test_gather_structpt2_8f32_mask_index_pair(pt
 ; X64-SKX-SMALL-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X64-SKX-SMALL-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X64-SKX-SMALL-NEXT:    vpmovd2m %ymm0, %k1
-; X64-SKX-SMALL-NEXT:    vmovdqu (%rsi), %ymm0
-; X64-SKX-SMALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; X64-SKX-SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-SKX-SMALL-NEXT:    vmovups (%rsi), %ymm0
+; X64-SKX-SMALL-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2
 ; X64-SKX-SMALL-NEXT:    kmovw %k1, %k2
 ; X64-SKX-SMALL-NEXT:    vmovaps %ymm1, %ymm0
-; X64-SKX-SMALL-NEXT:    vgatherqps (%rdi,%zmm2,8), %ymm0 {%k2}
-; X64-SKX-SMALL-NEXT:    vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
+; X64-SKX-SMALL-NEXT:    vgatherdps (%rdi,%ymm2,8), %ymm0 {%k2}
+; X64-SKX-SMALL-NEXT:    vgatherdps 4(%rdi,%ymm2,8), %ymm1 {%k1}
 ; X64-SKX-SMALL-NEXT:    retq
 ;
 ; X64-SKX-LARGE-LABEL: test_gather_structpt2_8f32_mask_index_pair:
@@ -5307,14 +5290,13 @@ define {<8 x float>, <8 x float>} @test_gather_structpt2_8f32_mask_index_pair(pt
 ; X64-SKX-LARGE-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X64-SKX-LARGE-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X64-SKX-LARGE-NEXT:    vpmovd2m %ymm0, %k1
-; X64-SKX-LARGE-NEXT:    vmovdqu (%rsi), %ymm0
+; X64-SKX-LARGE-NEXT:    vmovups (%rsi), %ymm0
 ; X64-SKX-LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT:    vpandd (%rax){1to8}, %ymm0, %ymm0
-; X64-SKX-LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-SKX-LARGE-NEXT:    vandps (%rax){1to8}, %ymm0, %ymm2
 ; X64-SKX-LARGE-NEXT:    kmovw %k1, %k2
 ; X64-SKX-LARGE-NEXT:    vmovaps %ymm1, %ymm0
-; X64-SKX-LARGE-NEXT:    vgatherqps (%rdi,%zmm2,8), %ymm0 {%k2}
-; X64-SKX-LARGE-NEXT:    vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
+; X64-SKX-LARGE-NEXT:    vgatherdps (%rdi,%ymm2,8), %ymm0 {%k2}
+; X64-SKX-LARGE-NEXT:    vgatherdps 4(%rdi,%ymm2,8), %ymm1 {%k1}
 ; X64-SKX-LARGE-NEXT:    retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt2_8f32_mask_index_pair:
@@ -5350,14 +5332,8 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index(ptr %x, ptr %arr, <1
 ; X64-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; X64-KNL-NEXT:    vmovdqu64 (%rsi), %zmm0
 ; X64-KNL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; X64-KNL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
-; X64-KNL-NEXT:    kshiftrw $8, %k1, %k2
-; X64-KNL-NEXT:    vgatherqps (%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-KNL-NEXT:    vgatherqps (%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-KNL-NEXT:    vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-KNL-NEXT:    vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-KNL-NEXT:    retq
 ;
 ; X86-KNL-LABEL: test_gather_structpt2_16f32_mask_index:
@@ -5377,16 +5353,10 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index(ptr %x, ptr %arr, <1
 ; X64-SKX-SMALL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT:    vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-SMALL-NEXT:    kshiftrw $8, %k1, %k2
-; X64-SKX-SMALL-NEXT:    vgatherqps (%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-SKX-SMALL-NEXT:    vgatherqps (%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-SMALL-NEXT:    vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-SKX-SMALL-NEXT:    vmovups (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT:    vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-SMALL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-SKX-SMALL-NEXT:    retq
 ;
 ; X64-SKX-LARGE-LABEL: test_gather_structpt2_16f32_mask_index:
@@ -5394,17 +5364,11 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index(ptr %x, ptr %arr, <1
 ; X64-SKX-LARGE-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT:    vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT:    vmovups (%rsi), %zmm0
 ; X64-SKX-LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT:    vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-LARGE-NEXT:    kshiftrw $8, %k1, %k2
-; X64-SKX-LARGE-NEXT:    vgatherqps (%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-SKX-LARGE-NEXT:    vgatherqps (%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-LARGE-NEXT:    vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-SKX-LARGE-NEXT:    vandps (%rax){1to16}, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT:    vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-LARGE-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-SKX-LARGE-NEXT:    retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt2_16f32_mask_index:
@@ -5434,14 +5398,8 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index_offset(ptr %x, ptr %
 ; X64-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; X64-KNL-NEXT:    vmovdqu64 (%rsi), %zmm0
 ; X64-KNL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; X64-KNL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
-; X64-KNL-NEXT:    kshiftrw $8, %k1, %k2
-; X64-KNL-NEXT:    vgatherqps 4(%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-KNL-NEXT:    vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-KNL-NEXT:    vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-KNL-NEXT:    vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-KNL-NEXT:    retq
 ;
 ; X86-KNL-LABEL: test_gather_structpt2_16f32_mask_index_offset:
@@ -5461,16 +5419,10 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index_offset(ptr %x, ptr %
 ; X64-SKX-SMALL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT:    vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-SMALL-NEXT:    kshiftrw $8, %k1, %k2
-; X64-SKX-SMALL-NEXT:    vgatherqps 4(%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-SKX-SMALL-NEXT:    vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-SMALL-NEXT:    vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-SKX-SMALL-NEXT:    vmovups (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT:    vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-SMALL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-SKX-SMALL-NEXT:    retq
 ;
 ; X64-SKX-LARGE-LABEL: test_gather_structpt2_16f32_mask_index_offset:
@@ -5478,17 +5430,11 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index_offset(ptr %x, ptr %
 ; X64-SKX-LARGE-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT:    vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT:    vmovups (%rsi), %zmm0
 ; X64-SKX-LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT:    vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-LARGE-NEXT:    kshiftrw $8, %k1, %k2
-; X64-SKX-LARGE-NEXT:    vgatherqps 4(%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-SKX-LARGE-NEXT:    vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-LARGE-NEXT:    vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-SKX-LARGE-NEXT:    vandps (%rax){1to16}, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT:    vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-LARGE-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-SKX-LARGE-NEXT:    retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt2_16f32_mask_index_offset:
@@ -5517,22 +5463,11 @@ define {<16 x float>, <16 x float>} @test_gather_structpt2_16f32_mask_index_pair
 ; X64-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; X64-KNL-NEXT:    vmovdqu64 (%rsi), %zmm0
-; X64-KNL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; X64-KNL-NEXT:    vpmovzxdq {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT:    vextractf64x4 $1, %zmm1, %ymm4
-; X64-KNL-NEXT:    kshiftrw $8, %k1, %k2
-; X64-KNL-NEXT:    vmovaps %ymm4, %ymm0
-; X64-KNL-NEXT:    kmovw %k2, %k3
-; X64-KNL-NEXT:    vgatherqps (%rdi,%zmm3,8), %ymm0 {%k3}
-; X64-KNL-NEXT:    vmovaps %ymm1, %ymm5
-; X64-KNL-NEXT:    kmovw %k1, %k3
-; X64-KNL-NEXT:    vgatherqps (%rdi,%zmm2,8), %ymm5 {%k3}
-; X64-KNL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
-; X64-KNL-NEXT:    vgatherqps 4(%rdi,%zmm3,8), %ymm4 {%k2}
-; X64-KNL-NEXT:    vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-KNL-NEXT:    vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; X64-KNL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
+; X64-KNL-NEXT:    kmovw %k1, %k2
+; X64-KNL-NEXT:    vmovaps %zmm1, %zmm0
+; X64-KNL-NEXT:    vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-KNL-NEXT:    vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
 ; X64-KNL-NEXT:    retq
 ;
 ; X86-KNL-LABEL: test_gather_structpt2_16f32_mask_index_pair:
@@ -5554,23 +5489,12 @@ define {<16 x float>, <16 x float>} @test_gather_structpt2_16f32_mask_index_pair
 ; X64-SKX-SMALL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT:    vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm4
-; X64-SKX-SMALL-NEXT:    kshiftrw $8, %k1, %k2
-; X64-SKX-SMALL-NEXT:    vmovaps %ymm4, %ymm0
-; X64-SKX-SMALL-NEXT:    kmovw %k2, %k3
-; X64-SKX-SMALL-NEXT:    vgatherqps (%rdi,%zmm3,8), %ymm0 {%k3}
-; X64-SKX-SMALL-NEXT:    vmovaps %ymm1, %ymm5
-; X64-SKX-SMALL-NEXT:    kmovw %k1, %k3
-; X64-SKX-SMALL-NEXT:    vgatherqps (%rdi,%zmm2,8), %ymm5 {%k3}
-; X64-SKX-SMALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
-; X64-SKX-SMALL-NEXT:    vgatherqps 4(%rdi,%zmm3,8), %ymm4 {%k2}
-; X64-SKX-SMALL-NEXT:    vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-SMALL-NEXT:    vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; X64-SKX-SMALL-NEXT:    vmovups (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
+; X64-SKX-SMALL-NEXT:    kmovw %k1, %k2
+; X64-SKX-SMALL-NEXT:    vmovaps %zmm1, %zmm0
+; X64-SKX-SMALL-NEXT:    vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-SKX-SMALL-NEXT:    vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
 ; X64-SKX-SMALL-NEXT:    retq
 ;
 ; X64-SKX-LARGE-LABEL: test_gather_structpt2_16f32_mask_index_pair:
@@ -5578,24 +5502,13 @@ define {<16 x float>, <16 x float>} @test_gather_structpt2_16f32_mask_index_pair
 ; X64-SKX-LARGE-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT:    vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT:    vmovups (%rsi), %zmm0
 ; X64-SKX-LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT:    vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT:    vextractf64x4 $1, %zmm1, %ymm4
-; X64-SKX-LARGE-NEXT:    kshiftrw $8, %k1, %k2
-; X64-SKX-LARGE-NEXT:    kmovw %k2, %k3
-; X64-SKX-LARGE-NEXT:    vmovaps %ymm4, %ymm0
-; X64-SKX-LARGE-NEXT:    vgatherqps (%rdi,%zmm3,8), %ymm0 {%k3}
-; X64-SKX-LARGE-NEXT:    kmovw %k1, %k3
-; X64-SKX-LARGE-NEXT:    vmovaps %ymm1, %ymm5
-; X64-SKX-LARGE-NEXT:    vgatherqps (%rdi,%zmm2,8), %ymm5 {%k3}
-; X64-SKX-LARGE-NEXT:    vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
-; X64-SKX-LARGE-NEXT:    vgatherqps 4(%rdi,%zmm3,8), %ymm4 {%k2}
-; X64-SKX-LARGE-NEXT:    vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-LARGE-NEXT:    vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; X64-SKX-LARGE-NEXT:    vandps (%rax){1to16}, %zmm0, %zmm2
+; X64-SKX-LARGE-NEXT:    kmovw %k1, %k2
+; X64-SKX-LARGE-NEXT:    vmovaps %zmm1, %zmm0
+; X64-SKX-LARGE-NEXT:    vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-SKX-LARGE-NEXT:    vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
 ; X64-SKX-LARGE-NEXT:    retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt2_16f32_mask_index_pair:


        

