[llvm] 75d36dc - [X86][SelectionDAG] Fix the Gather's base and index by modifying the Scale value (#137813)
via llvm-commits <llvm-commits at lists.llvm.org>
Tue May 13 07:36:52 PDT 2025
Author: Rohit Aggarwal
Date: 2025-05-13T15:36:48+01:00
New Revision: 75d36dc5a77ba3759eef15aeca925a1400504c89
URL: https://github.com/llvm/llvm-project/commit/75d36dc5a77ba3759eef15aeca925a1400504c89
DIFF: https://github.com/llvm/llvm-project/commit/75d36dc5a77ba3759eef15aeca925a1400504c89.diff
LOG: [X86][SelectionDAG] Fix the Gather's base and index by modifying the Scale value (#137813)
Fix the gather's base and index for both single and multiple uses of the index node. The approach is to fold the index's SHL (left shift) into the Scale value when the shift is followed by a truncate, updating the base and index accordingly.
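
The transform rests on the address identity base + (index << s) * scale == base + (index << (s - 1)) * (2 * scale). Since x86 addressing modes only encode scales of 1, 2, 4 and 8, the doubled scale must stay at or below 8, which is what the (*MinShAmt + Log2ScaleAmt) < 4 guard in the patch enforces. A minimal standalone sketch of that arithmetic (hypothetical values, plain C++, not LLVM API code):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Base = 0x1000; // gather base pointer
  const uint64_t Index = 42;    // pre-shift index element
  const unsigned ShAmt = 1;     // index is known to be shifted left by >= 1
  const uint64_t Scale = 4;     // original gather scale

  // Original addressing: Base + (Index << ShAmt) * Scale.
  const uint64_t Before = Base + (Index << ShAmt) * Scale;
  // After the combine: one shift bit is folded into the scale (4 -> 8).
  const uint64_t After = Base + (Index << (ShAmt - 1)) * (Scale * 2);

  assert(Before == After); // both forms compute the same element address
  return 0;
}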
---------
Co-authored-by: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Co-authored-by: Simon Pilgrim <llvm-dev at redking.me.uk>
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/masked_gather_scatter.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e3bb5db07ac40..26da58a140331 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56712,6 +56712,34 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalize()) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();
+ // Attempt to move shifted index into the address scale, allows further
+ // index truncation below.
+ if (Index.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Scale)) {
+ unsigned ScaleAmt = Scale->getAsZExtVal();
+ assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
+ unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
+ unsigned MaskBits = IndexWidth - Log2ScaleAmt;
+ APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
+ if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
+ if (*MinShAmt >= 1 && (*MinShAmt + Log2ScaleAmt) < 4 &&
+ DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
+ SDValue ShAmt = Index.getOperand(1);
+ SDValue NewShAmt =
+ DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
+ DAG.getConstant(1, DL, ShAmt.getValueType()));
+ SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
+ Index.getOperand(0), NewShAmt);
+ SDValue NewScale =
+ DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
+ return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
+ }
+ }
+ }
// Shrink indices if they are larger than 32-bits.
// Only do this before legalize types since v2i64 could become v2i32.
@@ -56722,8 +56750,8 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
// FIXME: We could support more than just constant fold, but we need to be
// careful with costing. A truncate that can be optimized out would be
- // fine. Otherwise we might only want to create a truncate if it avoids a
- // split.
+ // fine. Otherwise we might only want to create a truncate if it avoids
+ // a split.
if (SDValue TruncIndex =
DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
@@ -56737,6 +56765,12 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
+
+ // Shrink if we remove an illegal type.
+ if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
+ Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+ }
}
}
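
For reference, the decision logic of the new combine can be condensed as follows (a sketch under assumptions, not the in-tree implementation: foldShiftIntoScale is a hypothetical helper, and MinShAmt/SignBits stand in for DAG.getValidMinimumShiftAmount and DAG.ComputeNumSignBits on the index's pre-shift operand):

#include <bit>
#include <cstdint>
#include <optional>

// Returns the doubled scale if one bit of the index's known left shift can
// be folded into the addressing-mode scale, otherwise std::nullopt. Mirrors
// the guards above: shift amount >= 1, shift amount plus log2(scale) stays
// below 4 so the doubled scale is at most 8, and more than one sign bit on
// the pre-shift value.
std::optional<uint64_t> foldShiftIntoScale(uint64_t ScaleAmt,
                                           unsigned MinShAmt,
                                           unsigned SignBits) {
  const unsigned Log2ScaleAmt = std::countr_zero(ScaleAmt); // power of 2
  if (MinShAmt >= 1 && MinShAmt + Log2ScaleAmt < 4 && SignBits > 1)
    return ScaleAmt * 2; // caller rebuilds the index with (ShAmt - 1)
  return std::nullopt;
}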
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 8b8ae7ff9b088..a5c727e8df9d6 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -604,17 +604,14 @@ define <16 x float> @test13(ptr %base, <16 x i32> %ind) {
; The base pointer is not splat, can't find uniform base
define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
-; X64-KNL-LABEL: test14:
-; X64-KNL: # %bb.0:
-; X64-KNL-NEXT: vmovq %xmm0, %rax
-; X64-KNL-NEXT: vmovd %esi, %xmm0
-; X64-KNL-NEXT: vpbroadcastd %xmm0, %ymm0
-; X64-KNL-NEXT: vpmovsxdq %ymm0, %zmm0
-; X64-KNL-NEXT: kxnorw %k0, %k0, %k1
-; X64-KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-KNL-NEXT: vgatherqps (%rax,%zmm0,4), %ymm1 {%k1}
-; X64-KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
-; X64-KNL-NEXT: retq
+; X64-LABEL: test14:
+; X64: # %bb.0:
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vpbroadcastd %esi, %zmm1
+; X64-NEXT: kxnorw %k0, %k0, %k1
+; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; X64-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
+; X64-NEXT: retq
;
; X86-LABEL: test14:
; X86: # %bb.0:
@@ -624,17 +621,6 @@ define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X86-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; X86-NEXT: retl
-;
-; X64-SKX-LABEL: test14:
-; X64-SKX: # %bb.0:
-; X64-SKX-NEXT: vmovq %xmm0, %rax
-; X64-SKX-NEXT: vpbroadcastd %esi, %ymm0
-; X64-SKX-NEXT: vpmovsxdq %ymm0, %zmm0
-; X64-SKX-NEXT: kxnorw %k0, %k0, %k1
-; X64-SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-SKX-NEXT: vgatherqps (%rax,%zmm0,4), %ymm1 {%k1}
-; X64-SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
-; X64-SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x ptr> %vec, ptr %base, i32 1
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
@@ -5116,13 +5102,13 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
define <8 x float> @test_gather_structpt2_8f32_mask_index(ptr %x, ptr %arr, <8 x i1> %mask, <8 x float> %src0) {
; X64-KNL-LABEL: test_gather_structpt2_8f32_mask_index:
; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; X64-KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; X64-KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
-; X64-KNL-NEXT: vpand (%rsi), %ymm0, %ymm0
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vgatherqps (%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-KNL-NEXT: vbroadcastss {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
+; X64-KNL-NEXT: vandps (%rsi), %ymm0, %ymm0
+; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %ymm1, %ymm0
; X64-KNL-NEXT: retq
;
@@ -5145,10 +5131,9 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index(ptr %x, ptr %arr, <8 x
; X64-SKX-SMALL-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-SKX-SMALL-NEXT: vpslld $31, %ymm0, %ymm0
; X64-SKX-SMALL-NEXT: vpmovd2m %ymm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu (%rsi), %ymm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-SKX-SMALL-NEXT: vmovups (%rsi), %ymm0
+; X64-SKX-SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%ymm0,8), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: vmovaps %ymm1, %ymm0
; X64-SKX-SMALL-NEXT: retq
;
@@ -5157,11 +5142,10 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index(ptr %x, ptr %arr, <8 x
; X64-SKX-LARGE-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-SKX-LARGE-NEXT: vpslld $31, %ymm0, %ymm0
; X64-SKX-LARGE-NEXT: vpmovd2m %ymm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu (%rsi), %ymm0
+; X64-SKX-LARGE-NEXT: vmovups (%rsi), %ymm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to8}, %ymm0, %ymm0
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-SKX-LARGE-NEXT: vandps (%rax){1to8}, %ymm0, %ymm0
+; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%ymm0,8), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: vmovaps %ymm1, %ymm0
; X64-SKX-LARGE-NEXT: retq
;
@@ -5187,13 +5171,13 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index(ptr %x, ptr %arr, <8 x
define <8 x float> @test_gather_structpt2_8f32_mask_index_offset(ptr %x, ptr %arr, <8 x i1> %mask, <8 x float> %src0) {
; X64-KNL-LABEL: test_gather_structpt2_8f32_mask_index_offset:
; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; X64-KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; X64-KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
-; X64-KNL-NEXT: vpand (%rsi), %ymm0, %ymm0
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-KNL-NEXT: vbroadcastss {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
+; X64-KNL-NEXT: vandps (%rsi), %ymm0, %ymm0
+; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %ymm1, %ymm0
; X64-KNL-NEXT: retq
;
@@ -5216,10 +5200,9 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index_offset(ptr %x, ptr %ar
; X64-SKX-SMALL-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-SKX-SMALL-NEXT: vpslld $31, %ymm0, %ymm0
; X64-SKX-SMALL-NEXT: vpmovd2m %ymm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu (%rsi), %ymm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-SKX-SMALL-NEXT: vmovups (%rsi), %ymm0
+; X64-SKX-SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%ymm0,8), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: vmovaps %ymm1, %ymm0
; X64-SKX-SMALL-NEXT: retq
;
@@ -5228,11 +5211,10 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index_offset(ptr %x, ptr %ar
; X64-SKX-LARGE-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-SKX-LARGE-NEXT: vpslld $31, %ymm0, %ymm0
; X64-SKX-LARGE-NEXT: vpmovd2m %ymm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu (%rsi), %ymm0
+; X64-SKX-LARGE-NEXT: vmovups (%rsi), %ymm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to8}, %ymm0, %ymm0
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm0,8), %ymm1 {%k1}
+; X64-SKX-LARGE-NEXT: vandps (%rax){1to8}, %ymm0, %ymm0
+; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%ymm0,8), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: vmovaps %ymm1, %ymm0
; X64-SKX-LARGE-NEXT: retq
;
@@ -5258,16 +5240,18 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index_offset(ptr %x, ptr %ar
define {<8 x float>, <8 x float>} @test_gather_structpt2_8f32_mask_index_pair(ptr %x, ptr %arr, <8 x i1> %mask, <8 x float> %src0) {
; X64-KNL-LABEL: test_gather_structpt2_8f32_mask_index_pair:
; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; X64-KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; X64-KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
-; X64-KNL-NEXT: vpand (%rsi), %ymm0, %ymm0
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-KNL-NEXT: vbroadcastss {{.*#+}} ymm0 = [536870911,536870911,536870911,536870911,536870911,536870911,536870911,536870911]
+; X64-KNL-NEXT: vandps (%rsi), %ymm0, %ymm2
+; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: kmovw %k1, %k2
-; X64-KNL-NEXT: vmovaps %ymm1, %ymm0
-; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2,8), %ymm0 {%k2}
-; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
+; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
+; X64-KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-KNL-NEXT: # kill: def $ymm1 killed $ymm1 killed $zmm1
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test_gather_structpt2_8f32_mask_index_pair:
@@ -5293,13 +5277,12 @@ define {<8 x float>, <8 x float>} @test_gather_structpt2_8f32_mask_index_pair(pt
; X64-SKX-SMALL-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-SKX-SMALL-NEXT: vpslld $31, %ymm0, %ymm0
; X64-SKX-SMALL-NEXT: vpmovd2m %ymm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu (%rsi), %ymm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-SKX-SMALL-NEXT: vmovups (%rsi), %ymm0
+; X64-SKX-SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2
; X64-SKX-SMALL-NEXT: kmovw %k1, %k2
; X64-SKX-SMALL-NEXT: vmovaps %ymm1, %ymm0
-; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2,8), %ymm0 {%k2}
-; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
+; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%ymm2,8), %ymm0 {%k2}
+; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%ymm2,8), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: retq
;
; X64-SKX-LARGE-LABEL: test_gather_structpt2_8f32_mask_index_pair:
@@ -5307,14 +5290,13 @@ define {<8 x float>, <8 x float>} @test_gather_structpt2_8f32_mask_index_pair(pt
; X64-SKX-LARGE-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-SKX-LARGE-NEXT: vpslld $31, %ymm0, %ymm0
; X64-SKX-LARGE-NEXT: vpmovd2m %ymm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu (%rsi), %ymm0
+; X64-SKX-LARGE-NEXT: vmovups (%rsi), %ymm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to8}, %ymm0, %ymm0
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-SKX-LARGE-NEXT: vandps (%rax){1to8}, %ymm0, %ymm2
; X64-SKX-LARGE-NEXT: kmovw %k1, %k2
; X64-SKX-LARGE-NEXT: vmovaps %ymm1, %ymm0
-; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2,8), %ymm0 {%k2}
-; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
+; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%ymm2,8), %ymm0 {%k2}
+; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%ymm2,8), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: retq
;
; X86-SKX-LABEL: test_gather_structpt2_8f32_mask_index_pair:
@@ -5350,14 +5332,8 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index(ptr %x, ptr %arr, <1
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
-; X64-KNL-NEXT: vgatherqps (%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test_gather_structpt2_16f32_mask_index:
@@ -5377,16 +5353,10 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index(ptr %x, ptr %arr, <1
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-SKX-SMALL-NEXT: vmovups (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
;
; X64-SKX-LARGE-LABEL: test_gather_structpt2_16f32_mask_index:
@@ -5394,17 +5364,11 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index(ptr %x, ptr %arr, <1
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: vmovups (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-SKX-LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
;
; X86-SKX-LABEL: test_gather_structpt2_16f32_mask_index:
@@ -5434,14 +5398,8 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index_offset(ptr %x, ptr %
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
-; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test_gather_structpt2_16f32_mask_index_offset:
@@ -5461,16 +5419,10 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index_offset(ptr %x, ptr %
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-SKX-SMALL-NEXT: vmovups (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
;
; X64-SKX-LARGE-LABEL: test_gather_structpt2_16f32_mask_index_offset:
@@ -5478,17 +5430,11 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index_offset(ptr %x, ptr %
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: vmovups (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm0,8), %ymm3 {%k2}
-; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-SKX-LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
;
; X86-SKX-LABEL: test_gather_structpt2_16f32_mask_index_offset:
@@ -5517,22 +5463,11 @@ define {<16 x float>, <16 x float>} @test_gather_structpt2_16f32_mask_index_pair
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
-; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
-; X64-KNL-NEXT: vmovaps %ymm4, %ymm0
-; X64-KNL-NEXT: kmovw %k2, %k3
-; X64-KNL-NEXT: vgatherqps (%rdi,%zmm3,8), %ymm0 {%k3}
-; X64-KNL-NEXT: vmovaps %ymm1, %ymm5
-; X64-KNL-NEXT: kmovw %k1, %k3
-; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2,8), %ymm5 {%k3}
-; X64-KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
-; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm3,8), %ymm4 {%k2}
-; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-KNL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
+; X64-KNL-NEXT: kmovw %k1, %k2
+; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
+; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test_gather_structpt2_16f32_mask_index_pair:
@@ -5554,23 +5489,12 @@ define {<16 x float>, <16 x float>} @test_gather_structpt2_16f32_mask_index_pair
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
-; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-SMALL-NEXT: vmovaps %ymm4, %ymm0
-; X64-SKX-SMALL-NEXT: kmovw %k2, %k3
-; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm3,8), %ymm0 {%k3}
-; X64-SKX-SMALL-NEXT: vmovaps %ymm1, %ymm5
-; X64-SKX-SMALL-NEXT: kmovw %k1, %k3
-; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2,8), %ymm5 {%k3}
-; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
-; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm3,8), %ymm4 {%k2}
-; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; X64-SKX-SMALL-NEXT: vmovups (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
+; X64-SKX-SMALL-NEXT: kmovw %k1, %k2
+; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
; X64-SKX-SMALL-NEXT: retq
;
; X64-SKX-LARGE-LABEL: test_gather_structpt2_16f32_mask_index_pair:
@@ -5578,24 +5502,13 @@ define {<16 x float>, <16 x float>} @test_gather_structpt2_16f32_mask_index_pair
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: vmovups (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm4
-; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-LARGE-NEXT: kmovw %k2, %k3
-; X64-SKX-LARGE-NEXT: vmovaps %ymm4, %ymm0
-; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm3,8), %ymm0 {%k3}
-; X64-SKX-LARGE-NEXT: kmovw %k1, %k3
-; X64-SKX-LARGE-NEXT: vmovaps %ymm1, %ymm5
-; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2,8), %ymm5 {%k3}
-; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
-; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm3,8), %ymm4 {%k2}
-; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2,8), %ymm1 {%k1}
-; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; X64-SKX-LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm2
+; X64-SKX-LARGE-NEXT: kmovw %k1, %k2
+; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
; X64-SKX-LARGE-NEXT: retq
;
; X86-SKX-LABEL: test_gather_structpt2_16f32_mask_index_pair: