[llvm] 2bc28c6 - [X86] Add a dependency breaking xor before any gathers with an undef passthru value.
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 27 20:45:22 PDT 2021
Author: Phoebe Wang
Date: 2021-10-28T11:44:52+08:00
New Revision: 2bc28c6f825ec7b33854288970c6646e17425d62
URL: https://github.com/llvm/llvm-project/commit/2bc28c6f825ec7b33854288970c6646e17425d62
DIFF: https://github.com/llvm/llvm-project/commit/2bc28c6f825ec7b33854288970c6646e17425d62.diff
LOG: [X86] Add a dependency breaking xor before any gathers with an undef passthru value.
In the instruction encoding, the passthru register is always
tied to the destination register. The CPU scheduler has to wait
for the last writer of this register to finish executing before
the gather can start. This is true even if the initial mask is
all ones, in which case the passthru will never actually be used.
By explicitly zeroing the register we can break the false
dependency. The zero idiom is executed completely by the
register renamer and so is immediately considered ready.
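As a rough illustration (not part of this patch), the kind of source that reaches
this lowering is a masked gather whose passthru operand is undefined, e.g. written
with the AVX-512 intrinsics. This is only a sketch: the function name is made up,
and it assumes compilation with AVX-512F enabled (e.g. -mavx512f).

#include <immintrin.h>

// Sketch only. Before this change, the undefined passthru meant the gather's
// destination register kept a false dependency on its previous writer; with
// this change the backend emits a vpxor/vxorps on the destination first.
__m512 gather_example(const float *base, __m512i index, __mmask16 mask) {
  // The passthru is "undefined": only the lanes selected by `mask` matter,
  // so the gather has no real input dependency through its destination.
  __m512 passthru = _mm512_undefined_ps();
  return _mm512_mask_i32gather_ps(passthru, mask, index, base, /*scale=*/4);
}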
Authored by Craig.
Reviewed By: lebedev.ri
Differential Revision: https://reviews.llvm.org/D112505
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/masked_gather.ll
llvm/test/CodeGen/X86/masked_gather_scatter.ll
llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
llvm/test/CodeGen/X86/pr45067.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9f8aaca38a7ad..15eec7a697268 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31116,6 +31116,10 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
+ // Break dependency on the data register.
+ if (PassThru.isUndef())
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
N->getScale() };
SDValue NewGather = DAG.getMemIntrinsicNode(
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index 2f00b80bb76bb..fc7618bf9e8e7 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -1748,24 +1748,28 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: kmovw %k1, %k2
-; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm1 {%k2}
+; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm2 {%k2}
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
-; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm2 {%k1}
-; AVX512F-NEXT: vpaddd %ymm2, %ymm2, %ymm0
-; AVX512F-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm1 {%k1}
+; AVX512F-NEXT: vpaddd %ymm1, %ymm1, %ymm0
+; AVX512F-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: gather_v8i32_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [12,12,12,12,12,12,12,12]
+; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12]
; AVX512VL-NEXT: kmovw %k1, %k2
-; AVX512VL-NEXT: vpgatherdd c(,%ymm0), %ymm1 {%k2}
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [28,28,28,28,28,28,28,28]
-; AVX512VL-NEXT: vpgatherdd c(,%ymm0), %ymm2 {%k1}
-; AVX512VL-NEXT: vpaddd %ymm2, %ymm2, %ymm0
-; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpgatherdd c(,%ymm1), %ymm2 {%k2}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [28,28,28,28,28,28,28,28]
+; AVX512VL-NEXT: vpgatherdd c(,%ymm1), %ymm0 {%k1}
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%1 = icmp eq <8 x i32> %trigger, zeroinitializer
%2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> getelementptr (%struct.a, <8 x %struct.a*> <%struct.a* @c, %struct.a* @c, %struct.a* @c, %struct.a* @c, %struct.a* @c, %struct.a* @c, %struct.a* @c, %struct.a* @c>, <8 x i64> zeroinitializer, i32 0, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>), i32 4, <8 x i1> %1, <8 x i32> undef)
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 12d545099d216..d6c3f8625ffec 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -21,6 +21,7 @@ define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test1:
; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
@@ -29,6 +30,7 @@ define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
@@ -36,6 +38,7 @@ define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; SKX-LABEL: test1:
; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
@@ -44,6 +47,7 @@ define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
@@ -78,6 +82,7 @@ define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test2:
; KNL_64: # %bb.0:
; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
@@ -86,6 +91,7 @@ define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
@@ -93,6 +99,7 @@ define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; SKX-LABEL: test2:
; SKX: # %bb.0:
; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
@@ -101,6 +108,7 @@ define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
@@ -119,6 +127,7 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test3:
; KNL_64: # %bb.0:
; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_64-NEXT: retq
@@ -127,6 +136,7 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
@@ -134,6 +144,7 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; SKX-LABEL: test3:
; SKX: # %bb.0:
; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
@@ -142,6 +153,7 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX_32-NEXT: retl
@@ -161,6 +173,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test4:
; KNL_64: # %bb.0:
; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -172,6 +185,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -182,6 +196,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; SKX-LABEL: test4:
; SKX: # %bb.0:
; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -193,6 +208,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -292,6 +308,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_64-LABEL: test6:
; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL_64-NEXT: kxnorw %k0, %k0, %k2
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
@@ -302,6 +319,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_32: # %bb.0:
; KNL_32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL_32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL_32-NEXT: movw $255, %ax
; KNL_32-NEXT: kmovw %eax, %k1
; KNL_32-NEXT: kmovw %k1, %k2
@@ -313,6 +331,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; SKX-LABEL: test6:
; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: kxnorw %k0, %k0, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
@@ -322,6 +341,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; SKX_32-LABEL: test6:
; SKX_32: # %bb.0:
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX_32-NEXT: kxnorw %k0, %k0, %k2
; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
@@ -342,6 +362,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; KNL_64-NEXT: kmovw %esi, %k0
; KNL_64-NEXT: kshiftlw $8, %k0, %k0
; KNL_64-NEXT: kshiftrw $8, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -357,6 +378,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; KNL_32-NEXT: kmovw %ecx, %k0
; KNL_32-NEXT: kshiftlw $8, %k0, %k0
; KNL_32-NEXT: kshiftrw $8, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -367,6 +389,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; SKX-LABEL: test7:
; SKX: # %bb.0:
; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
; SKX-NEXT: vmovdqa %ymm1, %ymm2
@@ -378,6 +401,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
@@ -403,20 +427,23 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_64: # %bb.0:
; KNL_64-NEXT: kmovw %edi, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL_64-NEXT: kmovw %k2, %k3
-; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
+; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k3}
; KNL_64-NEXT: kmovw %k1, %k3
-; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
-; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
-; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
-; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
-; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k3}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm4
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k2}
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test8:
; KNL_32: # %bb.0:
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -428,20 +455,23 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; SKX: # %bb.0:
; SKX-NEXT: kmovw %edi, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: kmovw %k2, %k3
-; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k3}
; SKX-NEXT: kmovw %k1, %k3
-; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
-; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
-; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
-; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
-; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k3}
+; SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm4
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k2}
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
+; SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test8:
; SKX_32: # %bb.0:
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -478,6 +508,7 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
@@ -491,6 +522,7 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: movw $255, %ax
; KNL_32-NEXT: kmovw %eax, %k1
; KNL_32-NEXT: vpgatherdd 68(,%zmm1), %zmm0 {%k1}
@@ -506,6 +538,7 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm1
; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; SKX_SMALL-NEXT: retq
;
@@ -520,6 +553,7 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm1
; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; SKX_LARGE-NEXT: retq
;
@@ -531,6 +565,7 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1}
; SKX_32-NEXT: retl
entry:
@@ -557,6 +592,7 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
@@ -570,6 +606,7 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: movw $255, %ax
; KNL_32-NEXT: kmovw %eax, %k1
; KNL_32-NEXT: vpgatherdd 68(,%zmm1), %zmm0 {%k1}
@@ -585,6 +622,7 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm1
; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; SKX_SMALL-NEXT: retq
;
@@ -599,6 +637,7 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm1
; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; SKX_LARGE-NEXT: retq
;
@@ -610,6 +649,7 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1}
; SKX_32-NEXT: retl
entry:
@@ -629,6 +669,7 @@ define <16 x float> @test11(float* %base, i32 %ind) {
; KNL_64-NEXT: leaq (%rdi,%rax,4), %rax
; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; KNL_64-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
@@ -639,6 +680,7 @@ define <16 x float> @test11(float* %base, i32 %ind) {
; KNL_32-NEXT: addl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
@@ -648,6 +690,7 @@ define <16 x float> @test11(float* %base, i32 %ind) {
; SKX-NEXT: leaq (%rdi,%rax,4), %rax
; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; SKX-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq
;
@@ -658,6 +701,7 @@ define <16 x float> @test11(float* %base, i32 %ind) {
; SKX_32-NEXT: addl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT: retl
@@ -675,6 +719,7 @@ define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test12:
; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
@@ -683,6 +728,7 @@ define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
@@ -690,6 +736,7 @@ define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; SKX-LABEL: test12:
; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
@@ -698,6 +745,7 @@ define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
@@ -714,6 +762,7 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test13:
; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
@@ -722,6 +771,7 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
@@ -729,6 +779,7 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; SKX-LABEL: test13:
; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
@@ -737,6 +788,7 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
@@ -758,6 +810,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: vpsllq $2, %zmm0, %zmm0
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
; KNL_64-NEXT: retq
@@ -767,6 +820,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_32-NEXT: vmovd %xmm0, %eax
; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
@@ -777,6 +831,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; SKX-NEXT: vpmovsxdq %ymm0, %zmm0
; SKX-NEXT: vpsllq $2, %zmm0, %zmm0
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
; SKX-NEXT: retq
@@ -786,6 +841,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; SKX_32-NEXT: vmovd %xmm0, %eax
; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
; SKX_32-NEXT: retl
@@ -895,6 +951,7 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
@@ -904,6 +961,7 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vpmovd2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; SKX_32-NEXT: retl
@@ -2379,6 +2437,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test29:
; KNL_64: # %bb.0:
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: movw $44, %ax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
@@ -2388,6 +2447,7 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; KNL_32-LABEL: test29:
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: movw $44, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -2396,6 +2456,7 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
;
; SKX-LABEL: test29:
; SKX: # %bb.0:
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: movw $44, %ax
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
@@ -2405,6 +2466,7 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; SKX_32-LABEL: test29:
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: movw $44, %cx
; SKX_32-NEXT: kmovw %ecx, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -2668,16 +2730,19 @@ define <16 x float*> @test31(<16 x float**> %ptrs) {
; KNL_64-LABEL: test31:
; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3
; KNL_64-NEXT: kxnorw %k0, %k0, %k2
-; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
-; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
-; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
-; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
+; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
+; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
+; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
+; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test31:
; KNL_32: # %bb.0:
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
@@ -2685,16 +2750,19 @@ define <16 x float*> @test31(<16 x float**> %ptrs) {
; SKX-LABEL: test31:
; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; SKX-NEXT: kxnorw %k0, %k0, %k2
-; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
-; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
+; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
+; SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
+; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm1
; SKX-NEXT: retq
;
; SKX_32-LABEL: test31:
; SKX_32: # %bb.0:
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX_32-NEXT: retl
@@ -3425,6 +3493,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -3441,6 +3510,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; SKX_32-NEXT: subl $32, %esp
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -3461,6 +3531,7 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) {
; KNL_64-LABEL: test_global_array:
; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
; KNL_64-NEXT: retq
@@ -3468,6 +3539,7 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) {
; KNL_32-LABEL: test_global_array:
; KNL_32: # %bb.0:
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
; KNL_32-NEXT: retl
@@ -3475,6 +3547,7 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) {
; SKX_SMALL-LABEL: test_global_array:
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
; SKX_SMALL-NEXT: retq
@@ -3483,6 +3556,7 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) {
; SKX_LARGE: # %bb.0:
; SKX_LARGE-NEXT: movabsq $glob_array, %rax
; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
; SKX_LARGE-NEXT: retq
@@ -3490,6 +3564,7 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) {
; SKX_32-LABEL: test_global_array:
; SKX_32: # %bb.0:
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
; SKX_32-NEXT: retl
@@ -3502,6 +3577,7 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
; KNL_64-LABEL: test_global_array_zeroinitializer_index:
; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
; KNL_64-NEXT: retq
@@ -3509,6 +3585,7 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
; KNL_32-LABEL: test_global_array_zeroinitializer_index:
; KNL_32: # %bb.0:
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
; KNL_32-NEXT: retl
@@ -3516,6 +3593,7 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
; SKX_SMALL-LABEL: test_global_array_zeroinitializer_index:
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
; SKX_SMALL-NEXT: retq
@@ -3524,6 +3602,7 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
; SKX_LARGE: # %bb.0:
; SKX_LARGE-NEXT: movabsq $glob_array, %rax
; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
; SKX_LARGE-NEXT: retq
@@ -3531,6 +3610,7 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
; SKX_32-LABEL: test_global_array_zeroinitializer_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
; SKX_32-NEXT: retl
@@ -3750,6 +3830,7 @@ define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
@@ -3758,6 +3839,7 @@ define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
@@ -3765,6 +3847,7 @@ define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq
;
@@ -3773,6 +3856,7 @@ define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT: retl
@@ -3788,6 +3872,7 @@ define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
; KNL_64-LABEL: sext_v8i8_index:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm0, %ymm1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT: movw $255, %ax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
@@ -3798,6 +3883,7 @@ define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxbd %xmm0, %ymm1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: movw $255, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
@@ -3808,6 +3894,7 @@ define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm0, %ymm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; SKX-NEXT: retq
;
@@ -3816,6 +3903,7 @@ define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpmovsxbd %xmm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; SKX_32-NEXT: retl
@@ -3833,6 +3921,7 @@ define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
@@ -3841,6 +3930,7 @@ define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
@@ -3848,6 +3938,7 @@ define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
; SKX: # %bb.0:
; SKX-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq
;
@@ -3856,6 +3947,7 @@ define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT: retl
@@ -3871,6 +3963,7 @@ define <8 x float> @zext_v8i8_index(float* %base, <8 x i8> %ind) {
; KNL_64-LABEL: zext_v8i8_index:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT: movw $255, %ax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
@@ -3881,6 +3974,7 @@ define <8 x float> @zext_v8i8_index(float* %base, <8 x i8> %ind) {
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: movw $255, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
@@ -3891,6 +3985,7 @@ define <8 x float> @zext_v8i8_index(float* %base, <8 x i8> %ind) {
; SKX: # %bb.0:
; SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; SKX-NEXT: retq
;
@@ -3899,6 +3994,7 @@ define <8 x float> @zext_v8i8_index(float* %base, <8 x i8> %ind) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; SKX_32-NEXT: retl
@@ -4027,6 +4123,7 @@ define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
@@ -4035,6 +4132,7 @@ define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
@@ -4042,6 +4140,7 @@ define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX_SMALL-NEXT: retq
;
@@ -4050,6 +4149,7 @@ define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1
; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vxorps %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX_LARGE-NEXT: retq
;
@@ -4058,6 +4158,7 @@ define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT: retl
%ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
@@ -4230,6 +4331,7 @@ define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %f
; KNL_64: # %bb.0:
; KNL_64-NEXT: vmovaps %zmm0, (%rsi)
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vaddps %zmm1, %zmm1, %zmm0
; KNL_64-NEXT: retq
@@ -4240,6 +4342,7 @@ define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %f
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: vmovaps %zmm0, (%ecx)
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
; KNL_32-NEXT: retl
@@ -4248,6 +4351,7 @@ define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %f
; SKX: # %bb.0:
; SKX-NEXT: vmovaps %zmm0, (%rsi)
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vaddps %zmm1, %zmm1, %zmm0
; SKX-NEXT: retq
@@ -4258,6 +4362,7 @@ define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %f
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; SKX_32-NEXT: vmovaps %zmm0, (%ecx)
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
; SKX_32-NEXT: retl
@@ -5112,6 +5217,7 @@ define <8 x i64> @pr45906(<8 x %struct.foo*> %ptr) {
; KNL_64-LABEL: pr45906:
; KNL_64: # %bb.0: # %bb
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_64-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_64-NEXT: retq
@@ -5119,6 +5225,7 @@ define <8 x i64> @pr45906(<8 x %struct.foo*> %ptr) {
; KNL_32-LABEL: pr45906:
; KNL_32: # %bb.0: # %bb
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL_32-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
@@ -5126,6 +5233,7 @@ define <8 x i64> @pr45906(<8 x %struct.foo*> %ptr) {
; SKX-LABEL: pr45906:
; SKX: # %bb.0: # %bb
; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1}
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
@@ -5133,6 +5241,7 @@ define <8 x i64> @pr45906(<8 x %struct.foo*> %ptr) {
; SKX_32-LABEL: pr45906:
; SKX_32: # %bb.0: # %bb
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX_32-NEXT: retl
@@ -5142,69 +5251,3 @@ bb:
ret <8 x i64> %tmp1
}
declare <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>)
-
-%struct.ST2 = type { i32, i32 }
-
-; Make sure we don't use a displacement on the gather. The constant from the
-; struct offset should be folded into the constant pool load for the vector
-; add.
-define <8 x i32> @test_const_fold(%struct.ST2* %base, <8 x i64> %i1) {
-; KNL_64-LABEL: test_const_fold:
-; KNL_64: # %bb.0: # %entry
-; KNL_64-NEXT: vpsllq $3, %zmm0, %zmm0
-; KNL_64-NEXT: vpbroadcastq %rdi, %zmm1
-; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
-; KNL_64-NEXT: kxnorw %k0, %k0, %k1
-; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
-; KNL_64-NEXT: retq
-;
-; KNL_32-LABEL: test_const_fold:
-; KNL_32: # %bb.0: # %entry
-; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
-; KNL_32-NEXT: vpslld $3, %ymm0, %ymm0
-; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm1
-; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
-; KNL_32-NEXT: movw $255, %ax
-; KNL_32-NEXT: kmovw %eax, %k1
-; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
-; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; KNL_32-NEXT: retl
-;
-; SKX_SMALL-LABEL: test_const_fold:
-; SKX_SMALL: # %bb.0: # %entry
-; SKX_SMALL-NEXT: vpsllq $3, %zmm0, %zmm0
-; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm1
-; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
-; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
-; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
-; SKX_SMALL-NEXT: retq
-;
-; SKX_LARGE-LABEL: test_const_fold:
-; SKX_LARGE: # %bb.0: # %entry
-; SKX_LARGE-NEXT: vpsllq $3, %zmm0, %zmm0
-; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm1
-; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; SKX_LARGE-NEXT: vpaddq (%rax), %zmm0, %zmm1
-; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
-; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
-; SKX_LARGE-NEXT: retq
-;
-; SKX_32-LABEL: test_const_fold:
-; SKX_32: # %bb.0: # %entry
-; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
-; SKX_32-NEXT: vpslld $3, %ymm0, %ymm0
-; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
-; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
-; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
-; SKX_32-NEXT: retl
-entry:
- %add = add <8 x i64> %i1, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
- %arrayidx = getelementptr %struct.ST2, %struct.ST2* %base, <8 x i64> %add, i32 1
- %res = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
- ret <8 x i32> %res
-}
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
index 3e6112fe26346..48ef8bdbccd19 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
@@ -664,12 +664,14 @@ define <17 x float> @test_mgather_v17f32(float* %base, <17 x i32> %index)
; WIDEN_SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; WIDEN_SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT: kxnorw %k0, %k0, %k1
-; WIDEN_SKX-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm2 {%k1}
+; WIDEN_SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; WIDEN_SKX-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; WIDEN_SKX-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm3 {%k1}
; WIDEN_SKX-NEXT: movw $1, %cx
; WIDEN_SKX-NEXT: kmovw %ecx, %k1
-; WIDEN_SKX-NEXT: vgatherdps (%rsi,%zmm1,4), %zmm0 {%k1}
-; WIDEN_SKX-NEXT: vmovss %xmm0, 64(%rdi)
-; WIDEN_SKX-NEXT: vmovaps %zmm2, (%rdi)
+; WIDEN_SKX-NEXT: vgatherdps (%rsi,%zmm1,4), %zmm2 {%k1}
+; WIDEN_SKX-NEXT: vmovss %xmm2, 64(%rdi)
+; WIDEN_SKX-NEXT: vmovaps %zmm3, (%rdi)
; WIDEN_SKX-NEXT: vzeroupper
; WIDEN_SKX-NEXT: retq
;
@@ -697,12 +699,14 @@ define <17 x float> @test_mgather_v17f32(float* %base, <17 x i32> %index)
; WIDEN_KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; WIDEN_KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT: kxnorw %k0, %k0, %k1
-; WIDEN_KNL-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm2 {%k1}
+; WIDEN_KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; WIDEN_KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; WIDEN_KNL-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm3 {%k1}
; WIDEN_KNL-NEXT: movw $1, %cx
; WIDEN_KNL-NEXT: kmovw %ecx, %k1
-; WIDEN_KNL-NEXT: vgatherdps (%rsi,%zmm1,4), %zmm0 {%k1}
-; WIDEN_KNL-NEXT: vmovss %xmm0, 64(%rdi)
-; WIDEN_KNL-NEXT: vmovaps %zmm2, (%rdi)
+; WIDEN_KNL-NEXT: vgatherdps (%rsi,%zmm1,4), %zmm2 {%k1}
+; WIDEN_KNL-NEXT: vmovss %xmm2, 64(%rdi)
+; WIDEN_KNL-NEXT: vmovaps %zmm3, (%rdi)
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
;
@@ -711,32 +715,35 @@ define <17 x float> @test_mgather_v17f32(float* %base, <17 x i32> %index)
; WIDEN_AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; WIDEN_AVX2-NEXT: movq %rdi, %rax
; WIDEN_AVX2-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; WIDEN_AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; WIDEN_AVX2-NEXT: movq %rdi, %rax
-; WIDEN_AVX2-NEXT: vmovd %edx, %xmm2
-; WIDEN_AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; WIDEN_AVX2-NEXT: vpinsrd $2, %r8d, %xmm2, %xmm2
-; WIDEN_AVX2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; WIDEN_AVX2-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
-; WIDEN_AVX2-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; WIDEN_AVX2-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; WIDEN_AVX2-NEXT: vmovd %edx, %xmm3
+; WIDEN_AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
+; WIDEN_AVX2-NEXT: vpinsrd $2, %r8d, %xmm3, %xmm3
+; WIDEN_AVX2-NEXT: vpinsrd $3, %r9d, %xmm3, %xmm3
; WIDEN_AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; WIDEN_AVX2-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm4, %xmm1
+; WIDEN_AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; WIDEN_AVX2-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; WIDEN_AVX2-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; WIDEN_AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; WIDEN_AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; WIDEN_AVX2-NEXT: vgatherdps %ymm2, (%rsi,%ymm1,4), %ymm4
-; WIDEN_AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; WIDEN_AVX2-NEXT: vgatherdps %ymm1, (%rsi,%ymm0,4), %ymm2
+; WIDEN_AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
+; WIDEN_AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; WIDEN_AVX2-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; WIDEN_AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
+; WIDEN_AVX2-NEXT: vxorps %xmm6, %xmm6, %xmm6
+; WIDEN_AVX2-NEXT: vgatherdps %ymm5, (%rsi,%ymm1,4), %ymm6
+; WIDEN_AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; WIDEN_AVX2-NEXT: vgatherdps %ymm3, (%rsi,%ymm0,4), %ymm1
; WIDEN_AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
-; WIDEN_AVX2-NEXT: vgatherdps %ymm0, (%rsi,%ymm3,4), %ymm1
-; WIDEN_AVX2-NEXT: vmovss %xmm1, 64(%rdi)
-; WIDEN_AVX2-NEXT: vmovaps %ymm2, 32(%rdi)
-; WIDEN_AVX2-NEXT: vmovaps %ymm4, (%rdi)
+; WIDEN_AVX2-NEXT: vgatherdps %ymm0, (%rsi,%ymm2,4), %ymm4
+; WIDEN_AVX2-NEXT: vmovss %xmm4, 64(%rdi)
+; WIDEN_AVX2-NEXT: vmovaps %ymm1, 32(%rdi)
+; WIDEN_AVX2-NEXT: vmovaps %ymm6, (%rdi)
; WIDEN_AVX2-NEXT: vzeroupper
; WIDEN_AVX2-NEXT: retq
{
diff --git a/llvm/test/CodeGen/X86/pr45067.ll b/llvm/test/CodeGen/X86/pr45067.ll
index 1731213916981..17a25c628234d 100644
--- a/llvm/test/CodeGen/X86/pr45067.ll
+++ b/llvm/test/CodeGen/X86/pr45067.ll
@@ -7,8 +7,9 @@ define void @foo(<8 x i32>* %x, <8 x i1> %y) {
; CHECK-LABEL: foo:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rax
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpgatherdd %ymm1, (%rax,%ymm2), %ymm3
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: vpslld $31, %ymm0, %ymm0