[llvm] r322308 - [X86] Legalize 128/256-bit gathers/scatters on KNL by widening rather than sign-extending the index.
Author: ctopper
Date: Thu Jan 11 11:38:30 2018
New Revision: 322308
URL: http://llvm.org/viewvc/llvm-project?rev=322308&view=rev
Log:
[X86] Legalize 128/256-bit gathers/scatters on KNL by widening rather than sign-extending the index.
We can just widen the data and index vectors with undef elements and zero-extend the mask instead.
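For illustration, the widening computation below can be sketched standalone with plain bit widths in place of MVTs (widenedNumElts is a made-up helper for this sketch, not LLVM API):

#include <algorithm>
#include <cstdio>

// Widen by the smaller factor so that at least one of the data or index
// vectors reaches 512 bits; the extra lanes are undef and their mask bits
// are zero, so they are never accessed.
static unsigned widenedNumElts(unsigned DataBits, unsigned IndexBits,
                               unsigned NumElts) {
  unsigned Factor = std::min(512 / DataBits, 512 / IndexBits);
  return NumElts * Factor;
}

int main() {
  // v8i32 data with a v8i32 index (both 256-bit): factor 2 -> 16 lanes,
  // matching the vpgatherdd on %zmm with a movw $255 mask in the tests.
  printf("%u\n", widenedNumElts(256, 256, 8)); // 16
  // v4f64 data (256-bit) with a v4i32 index (128-bit): factor 2 -> 8 lanes,
  // matching vgatherdpd (%rdi,%ymm0,8), %zmm2 in test16.
  printf("%u\n", widenedNumElts(256, 128, 4)); // 8
  return 0;
}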
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=322308&r1=322307&r2=322308&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Jan 11 11:38:30 2018
@@ -24385,47 +24385,32 @@ static SDValue LowerMSCATTER(SDValue Op,
}
MVT IndexVT = Index.getSimpleValueType();
+ MVT MaskVT = Mask.getSimpleValueType();
// If the index is v2i32, we're being called by type legalization and we
// should just let the default handling take care of it.
if (IndexVT == MVT::v2i32)
return SDValue();
- unsigned NumElts = VT.getVectorNumElements();
+ // If we don't have VLX and neither the passthru nor the index is 512 bits,
+ // we need to widen until one is.
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
- // AVX512F supports only 512-bit vectors. Or data or index should
- // be 512 bit wide. If now the both index and data are 256-bit, but
- // the vector contains 8 elements, we just sign-extend the index
- if (IndexVT == MVT::v8i32)
- // Just extend index
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
- else {
- // The minimal number of elts in scatter is 8
- NumElts = 8;
- // Index
- MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
- // Use original index here, do not modify the index twice
- Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
- if (IndexVT.getScalarType() == MVT::i32)
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-
- // Mask
- // At this point we have promoted mask operand
- assert(Mask.getValueType().getScalarType() == MVT::i1 &&
- "unexpected mask type");
- MVT ExtMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
- // Use the original mask here, do not modify the mask twice
- Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
-
- // The value that should be stored
- MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
- Src = ExtendToType(Src, NewVT, DAG);
- }
+ // Determine how much we need to widen by to get a 512-bit type.
+ unsigned Factor = std::min(512/VT.getSizeInBits(),
+ 512/IndexVT.getSizeInBits());
+ unsigned NumElts = VT.getVectorNumElements() * Factor;
+
+ VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
+ MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+
+ Src = ExtendToType(Src, VT, DAG);
+ Index = ExtendToType(Index, IndexVT, DAG);
+ Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
- // The mask is killed by scatter, add it to the values
- SDVTList VTs = DAG.getVTList(Mask.getValueType(), MVT::Other);
+ SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
@@ -24532,68 +24517,46 @@ static SDValue LowerMGATHER(SDValue Op,
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- SDValue Scale = N->getScale();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Src0 = N->getValue();
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
- unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
// If the index is v2i32, we're being called by type legalization.
if (IndexVT == MVT::v2i32)
return SDValue();
+ // If we don't have VLX and neither the passthru nor the index is 512 bits,
+ // we need to widen until one is.
+ MVT OrigVT = VT;
if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
- !Index.getSimpleValueType().is512BitVector()) {
- // AVX512F supports only 512-bit vectors. Or data or index should
- // be 512 bit wide. If now the both index and data are 256-bit, but
- // the vector contains 8 elements, we just sign-extend the index
- if (NumElts == 8) {
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
- Scale };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
- N->getMemOperand());
- return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
- }
-
- // Minimal number of elements in Gather
- NumElts = 8;
- // Index
- MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
- Index = ExtendToType(Index, NewIndexVT, DAG);
- if (IndexVT.getScalarType() == MVT::i32)
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+ !IndexVT.is512BitVector()) {
+ // Determine how much we need to widen by to get a 512-bit type.
+ unsigned Factor = std::min(512/VT.getSizeInBits(),
+ 512/IndexVT.getSizeInBits());
+
+ unsigned NumElts = VT.getVectorNumElements() * Factor;
- // Mask
- assert(MaskVT.getScalarType() == MVT::i1 && "unexpected mask type");
+ VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
- Mask = ExtendToType(Mask, MaskVT, DAG, true);
- // The pass-through value
- MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
- Src0 = ExtendToType(Src0, NewVT, DAG);
-
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index, Scale };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(NewVT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
- N->getMemOperand());
- SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- NewGather.getValue(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue RetOps[] = {Extract, NewGather.getValue(2)};
- return DAG.getMergeValues(RetOps, dl);
+ Src0 = ExtendToType(Src0, VT, DAG);
+ Index = ExtendToType(Index, IndexVT, DAG);
+ Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index, Scale };
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
+ N->getScale() };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
N->getMemOperand());
- return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
+ NewGather, DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
}
SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=322308&r1=322307&r2=322308&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Thu Jan 11 11:38:30 2018
@@ -299,12 +299,14 @@ define <8 x i32> @test6(<8 x i32>%a1, <8
;
; KNL_32-LABEL: test6:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
-; KNL_32-NEXT: kxnorw %k0, %k0, %k2
-; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
-; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
-; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
+; KNL_32-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; KNL_32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; KNL_32-NEXT: movw $255, %ax
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm2 {%k2}
+; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test6:
@@ -335,25 +337,29 @@ define <8 x i32> @test7(i32* %base, <8 x
;
; KNL_64-LABEL: test7:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: kmovw %esi, %k1
-; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; KNL_64-NEXT: kmovw %esi, %k0
+; KNL_64-NEXT: kshiftlw $8, %k0, %k0
+; KNL_64-NEXT: kshiftrw $8, %k0, %k1
; KNL_64-NEXT: kmovw %k1, %k2
-; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
-; KNL_64-NEXT: vmovdqa %ymm1, %ymm2
-; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
+; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test7:
; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; KNL_32-NEXT: kmovw %ecx, %k1
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: kmovw %ecx, %k0
+; KNL_32-NEXT: kshiftlw $8, %k0, %k0
+; KNL_32-NEXT: kshiftrw $8, %k0, %k1
; KNL_32-NEXT: kmovw %k1, %k2
-; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
-; KNL_32-NEXT: vmovdqa %ymm1, %ymm2
-; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
+; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; KNL_32-NEXT: retl
;
@@ -486,10 +492,11 @@ define <8 x i32> @test9(%struct.ST* %bas
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT: movw $255, %ax
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL_32-NEXT: retl
;
; SKX_SMALL-LABEL: test9:
@@ -571,10 +578,11 @@ define <8 x i32> @test10(%struct.ST* %ba
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT: movw $255, %ax
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL_32-NEXT: retl
;
; SKX_SMALL-LABEL: test10:
@@ -811,28 +819,26 @@ declare <2 x double> @llvm.masked.gather
define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
; KNL_64-LABEL: test15:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL_64-NEXT: kshiftlw $12, %k0, %k0
; KNL_64-NEXT: kshiftrw $12, %k0, %k1
-; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %xmm1, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test15:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL_32-NEXT: kshiftlw $12, %k0, %k0
; KNL_32-NEXT: kshiftrw $12, %k0, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %xmm1, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -869,8 +875,7 @@ define <4 x double> @test16(double* %bas
; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL_64-NEXT: kshiftlw $12, %k0, %k0
; KNL_64-NEXT: kshiftrw $12, %k0, %k1
-; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovapd %ymm2, %ymm0
; KNL_64-NEXT: retq
;
@@ -883,8 +888,7 @@ define <4 x double> @test16(double* %bas
; KNL_32-NEXT: kshiftlw $12, %k0, %k0
; KNL_32-NEXT: kshiftrw $12, %k0, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovapd %ymm2, %ymm0
; KNL_32-NEXT: retl
;
@@ -989,14 +993,13 @@ define void @test18(<4 x i32>%a1, <4 x i
;
; KNL_32-LABEL: test18:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
-; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_32-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2
; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL_32-NEXT: kshiftlw $12, %k0, %k0
; KNL_32-NEXT: kshiftrw $12, %k0, %k1
-; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -1081,14 +1084,13 @@ define void @test20(<2 x float>%a1, <2 x
;
; KNL_32-LABEL: test20:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-NEXT: kshiftrw $14, %k0, %k1
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vscatterdps %zmm0, (,%zmm1) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -1130,10 +1132,9 @@ define void @test21(<2 x i32>%a1, <2 x i
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-NEXT: kshiftrw $14, %k0, %k1
-; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -1163,29 +1164,27 @@ declare <2 x float> @llvm.masked.gather.
define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
; KNL_64-LABEL: test22:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: # kill: def %xmm2 killed %xmm2 def %ymm2
+; KNL_64-NEXT: # kill: def %xmm2 killed %xmm2 def %zmm2
+; KNL_64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
; KNL_64-NEXT: kshiftlw $14, %k0, %k0
; KNL_64-NEXT: kshiftrw $14, %k0, %k1
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: # kill: def %xmm2 killed %xmm2 def %ymm2
+; KNL_32-NEXT: # kill: def %xmm2 killed %xmm2 def %zmm2
+; KNL_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-NEXT: kshiftrw $14, %k0, %k1
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %xmm2, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
@@ -1271,12 +1270,11 @@ define <2 x i32> @test23(i32* %base, <2
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; KNL_64-NEXT: kshiftlw $14, %k0, %k0
; KNL_64-NEXT: kshiftrw $14, %k0, %k1
-; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1}
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
@@ -1286,12 +1284,11 @@ define <2 x i32> @test23(i32* %base, <2
; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-NEXT: kshiftrw $14, %k0, %k1
-; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
@@ -1377,10 +1374,9 @@ define <2 x i32> @test24(i32* %base, <2
; KNL_64-LABEL: test24:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_64-NEXT: movb $3, %al
+; KNL_64-NEXT: movw $3, %ax
; KNL_64-NEXT: kmovw %eax, %k1
-; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1}
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
@@ -1389,10 +1385,9 @@ define <2 x i32> @test24(i32* %base, <2
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT: movb $3, %cl
+; KNL_32-NEXT: movw $3, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
-; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
@@ -1531,24 +1526,22 @@ define <2 x i64> @test26(i64* %base, <2
define <2 x float> @test27(float* %base, <2 x i32> %ind) {
; KNL_64-LABEL: test27:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT: movb $3, %al
+; KNL_64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; KNL_64-NEXT: movw $3, %ax
; KNL_64-NEXT: kmovw %eax, %k1
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test27:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT: movb $3, %cl
+; KNL_32-NEXT: movw $3, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
-; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -1590,10 +1583,9 @@ define void @test28(<2 x i32>%a1, <2 x i
; KNL_32: # %bb.0:
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT: movb $3, %al
+; KNL_32-NEXT: movw $3, %ax
; KNL_32-NEXT: kmovw %eax, %k1
-; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -2361,8 +2353,7 @@ define <4 x i64> @test_pr28312(<4 x i64*
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL_32-NEXT: kshiftlw $12, %k0, %k0
; KNL_32-NEXT: kshiftrw $12, %k0, %k1
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k1}
; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_32-NEXT: movl %ebp, %esp
@@ -2628,11 +2619,12 @@ define <8 x float> @sext_v8i8_index(floa
; KNL_64-LABEL: sext_v8i8_index:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpslld $24, %ymm0, %ymm0
-; KNL_64-NEXT: vpsrad $24, %ymm0, %ymm0
-; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL_64-NEXT: vpsrad $24, %ymm0, %ymm1
+; KNL_64-NEXT: movw $255, %ax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: sext_v8i8_index:
@@ -2640,10 +2632,11 @@ define <8 x float> @sext_v8i8_index(floa
; KNL_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpslld $24, %ymm0, %ymm0
-; KNL_32-NEXT: vpsrad $24, %ymm0, %ymm0
-; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; KNL_32-NEXT: vpsrad $24, %ymm0, %ymm1
+; KNL_32-NEXT: movw $255, %cx
+; KNL_32-NEXT: kmovw %ecx, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: sext_v8i8_index:
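A note on the test changes above: the vpmovsxdq sign extension of the index is gone; the operations stay 32-bit-indexed (vpgatherdd/vpscatterdd and friends) on widened 512-bit registers, and the mask keeps only the original lanes set (movw $255 for 8 valid lanes of 16, movw $3 for 2), so the undef upper lanes are never touched. A minimal sketch of that mask value (widenedAllOnesMask is a made-up helper for this sketch, not LLVM API):

#include <cstdint>
#include <cstdio>

// All-ones mask over the original ValidLanes lanes, zero-extended to the
// widened 16-lane mask width; this is the immediate the KNL output now
// materializes with movw/kmovw instead of kxnorw.
static uint16_t widenedAllOnesMask(unsigned ValidLanes) {
  return (uint16_t)((1u << ValidLanes) - 1u);
}

int main() {
  printf("0x%x\n", widenedAllOnesMask(8)); // 0xff -> movw $255, %ax
  printf("0x%x\n", widenedAllOnesMask(2)); // 0x3  -> movw $3, %ax
  return 0;
}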