[llvm] r318641 - [LV][X86] Support of AVX2 Gathers code generation and update the LV with this
Mohammed Agabaria via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 20 00:18:12 PST 2017
Author: magabari
Date: Mon Nov 20 00:18:12 2017
New Revision: 318641
URL: http://llvm.org/viewvc/llvm-project?rev=318641&view=rev
Log:
[LV][X86] Support of AVX2 Gathers code generation and update the LV with this
This patch depends on: https://reviews.llvm.org/D35348
Support of pattern selection of masked gathers of AVX2 (X86\AVX2 code gen)
Update LoopVectorize to generate gathers for AVX2 processors.
Reviewers: delena, zvi, RKSimon, craig.topper, aaboud, igorb
Reviewed By: delena, RKSimon
Differential Revision: https://reviews.llvm.org/D35772
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
llvm/trunk/lib/Target/X86/X86InstrSSE.td
llvm/trunk/lib/Target/X86/X86Subtarget.h
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=318641&r1=318640&r2=318641&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Nov 20 00:18:12 2017
@@ -970,6 +970,10 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
+ // Special handling for masked gather of 2 elements
+ if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
+ setOperationAction(ISD::MGATHER, MVT::v2i64, Custom);
+
if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
bool HasInt256 = Subtarget.hasInt256();
@@ -24301,8 +24305,8 @@ static SDValue LowerMSTORE(SDValue Op, c
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget.hasAVX512() &&
- "MGATHER/MSCATTER are supported on AVX-512 arch only");
+ assert(Subtarget.hasAVX2() &&
+ "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
SDLoc dl(Op);
@@ -24316,7 +24320,7 @@ static SDValue LowerMGATHER(SDValue Op,
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
- if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
+ if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// AVX512F supports only 512-bit vectors. Or data or index should
// be 512 bit wide. If now the both index and data are 256-bit, but
@@ -24359,7 +24363,7 @@ static SDValue LowerMGATHER(SDValue Op,
SDValue RetOps[] = {Extract, NewGather.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
- if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
+ if (N->getMemoryVT() == MVT::v2i32) {
// There is a special case when the return type is v2i32 is illegal and
// the type legaizer extended it to v2i64. Without this conversion we end up
// with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
@@ -24367,16 +24371,26 @@ static SDValue LowerMGATHER(SDValue Op,
// with index v2i64 and value type v4i32.
assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
"Unexpected type in masked gather");
- Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
- DAG.getBitcast(MVT::v4i32, Src0),
- DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
+ Src0 =
+ DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src0),
+ DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
// The mask should match the destination type. Extending mask with zeroes
// is not necessary since instruction itself reads only two values from
// memory.
+ SDVTList VTList;
+ if (Subtarget.hasVLX()) {
+ Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
+ VTList = DAG.getVTList(MVT::v4i32, MVT::v2i1, MVT::Other);
+ }
+ else {
+ Mask =
+ DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Mask),
+ DAG.getUNDEF(MVT::v4i32), {0, 2, -1, -1});
+ VTList = DAG.getVTList(MVT::v4i32, MVT::Other);
+ }
SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(MVT::v4i32, MVT::v2i1, MVT::Other), Ops, dl,
- N->getMemoryVT(), N->getMemOperand());
+ VTList, Ops, dl, N->getMemoryVT(), N->getMemOperand());
SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
NewGather.getValue(0), DAG);
Modified: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td?rev=318641&r1=318640&r2=318641&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td Mon Nov 20 00:18:12 2017
@@ -1101,3 +1101,91 @@ def masked_truncstore_us_vi32 : PatFrag<
(X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
+
+// AVX2 special nodes
+// masked gather of AVX2 where mask elements are i32
+def avx2_x86_masked_gather_32 : SDNode<"X86ISD::MGATHER",
+ SDTypeProfile<2, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
+ SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def avx2_masked_gather_32 : SDNode<"ISD::MGATHER",
+ SDTypeProfile<2, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
+ SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+// masked gather of AVX2 where mask elements are i64
+def avx2_masked_gather_64 : SDNode<"ISD::MGATHER",
+ SDTypeProfile<2, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
+ SDTCisPtrTy<4>, SDTCVecEltisVT<1, i64>, SDTCisSameNumEltsAs<0, 1>]>,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+// dword gathers
+def avx2_mvpgatherdd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
+ Mgt->getBasePtr().getValueType() == MVT::v4i32);
+ return false;
+}]>;
+
+def avx2_mvpgatherqd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (avx2_x86_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+ if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v2i64);
+ return false;
+}]>;
+
+def avx2_mvpgatherdd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v8i32 ||
+ Mgt->getBasePtr().getValueType() == MVT::v8i32);
+ return false;
+}]>;
+
+def avx2_mvpgatherqd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v4i64);
+ return false;
+}]>;
+
+// qwords
+def avx2_mvpgatherdq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v2i32 ||
+ Mgt->getBasePtr().getValueType() == MVT::v2i32);
+ return false;
+}]>;
+
+def avx2_mvpgatherqq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v2i64) &&
+ Mgt->getMemoryVT().is128BitVector();
+ return false;
+}]>;
+
+def avx2_mvpgatherdq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
+ Mgt->getBasePtr().getValueType() == MVT::v4i32);
+ return false;
+}]>;
+
+def avx2_mvpgatherqq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v4i64);
+ return false;
+}]>;
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=318641&r1=318640&r2=318641&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Mon Nov 20 00:18:12 2017
@@ -8326,36 +8326,52 @@ let Predicates = [HasAVX2, NoVLX] in {
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
-multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
+ ValueType VTy, PatFrag GatherNode128,
+ PatFrag GatherNode256, RegisterClass RC256,
X86MemOperand memop128, X86MemOperand memop256> {
def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
(ins VR128:$src1, memop128:$src2, VR128:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- []>, VEX;
+ [(set (VTx VR128:$dst), VR128:$mask_wb,
+ (GatherNode128 (VTx VR128:$src1), VR128:$mask,
+ vectoraddr:$src2))]>, VEX;
def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
(ins RC256:$src1, memop256:$src2, RC256:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- []>, VEX, VEX_L;
+ [(set (VTy RC256:$dst), RC256:$mask_wb,
+ (GatherNode256 (VTy RC256:$src1), RC256:$mask,
+ vectoraddr:$src2))]>, VEX, VEX_L;
}
-let mayLoad = 1, hasSideEffects = 0, Constraints
- = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
- in {
- defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
- defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
- defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
- defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;
+let Predicates = [UseAVX2] in {
+ let mayLoad = 1, hasSideEffects = 0, Constraints
+ = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+ in {
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, avx2_mvpgatherdq_pd_xmm,
+ avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, avx2_mvpgatherqq_pd_xmm,
+ avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, avx2_mvpgatherdd_ps_xmm,
+ avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, avx2_mvpgatherqd_ps_xmm,
+ avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>;
- let ExeDomain = SSEPackedDouble in {
- defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
- defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
- }
+ let ExeDomain = SSEPackedDouble in {
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, avx2_mvpgatherdq_pd_xmm,
+ avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, avx2_mvpgatherqq_pd_xmm,
+ avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W;
+ }
- let ExeDomain = SSEPackedSingle in {
- defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
- defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;
+ let ExeDomain = SSEPackedSingle in {
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, avx2_mvpgatherdd_ps_xmm,
+ avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, avx2_mvpgatherqd_ps_xmm,
+ avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>;
+ }
}
}
Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=318641&r1=318640&r2=318641&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Mon Nov 20 00:18:12 2017
@@ -51,17 +51,9 @@ enum Style {
} // end namespace PICStyles
class X86Subtarget final : public X86GenSubtargetInfo {
-protected:
- enum X86SSEEnum {
- NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
- };
-
- enum X863DNowEnum {
- NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
- };
-
+public:
enum X86ProcFamilyEnum {
- Others,
+ Others,
IntelAtom,
IntelSLM,
IntelGLM,
@@ -74,6 +66,15 @@ protected:
IntelIcelake,
};
+protected:
+ enum X86SSEEnum {
+ NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
+ };
+
+ enum X863DNowEnum {
+ NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
+ };
+
/// X86 processor family: Intel Atom, and others
X86ProcFamilyEnum X86ProcFamily;
Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=318641&r1=318640&r2=318641&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Mon Nov 20 00:18:12 2017
@@ -2368,8 +2368,9 @@ int X86TTIImpl::getGSVectorCost(unsigned
// Trying to reduce IndexSize to 32 bits for vector 16.
// By default the IndexSize is equal to pointer size.
- unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
- DL.getPointerSizeInBits();
+ unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
+ ? getIndexSizeInBits(Ptr, DL)
+ : DL.getPointerSizeInBits();
Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
IndexSize), VF);
@@ -2385,7 +2386,9 @@ int X86TTIImpl::getGSVectorCost(unsigned
// The gather / scatter cost is given by Intel architects. It is a rough
// number since we are looking at one instruction in a time.
- const int GSOverhead = 2;
+ const int GSOverhead = (Opcode == Instruction::Load)
+ ? ST->getGatherOverhead()
+ : ST->getScatterOverhead();
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace);
}
@@ -2456,7 +2459,7 @@ int X86TTIImpl::getGatherScatterOpCost(u
// the mask vector will add more instructions. Right now we give the scalar
// cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
// is better in the VariableMask case.
- if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+ if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
Scalarize = true;
if (Scalarize)
@@ -2515,11 +2518,15 @@ bool X86TTIImpl::isLegalMaskedGather(Typ
int DataWidth = isa<PointerType>(ScalarTy) ?
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
- // AVX-512 allows gather and scatter
- return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
+ // AVX-512 and Skylake AVX2 allows gather and scatter
+ return (DataWidth == 32 || DataWidth == 64) && (ST->hasAVX512() ||
+ ST->getProcFamily() == X86Subtarget::IntelSkylake);
}
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+ // AVX2 doesn't support scatter
+ if (!ST->hasAVX512())
+ return false;
return isLegalMaskedGather(DataType);
}
Modified: llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll?rev=318641&r1=318640&r2=318641&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll Mon Nov 20 00:18:12 2017
@@ -1,4 +1,5 @@
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s --check-prefix=AVX2
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skylake -cost-model -analyze < %s | FileCheck %s --check-prefix=SKL
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s | FileCheck %s --check-prefix=KNL
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze < %s | FileCheck %s --check-prefix=SKX
@@ -72,6 +73,9 @@ define <2 x double> @test_gather_2f64(<2
; AVX2-LABEL: test_gather_2f64
; AVX2: Found an estimated cost of 7 {{.*}}.gather
+; SKL-LABEL: test_gather_2f64
+; SKL: Found an estimated cost of 4 {{.*}}.gather
+
; KNL-LABEL: test_gather_2f64
; KNL: Found an estimated cost of 7 {{.*}}.gather
@@ -88,6 +92,9 @@ define <4 x i32> @test_gather_4i32(<4 x
; AVX2-LABEL: test_gather_4i32
; AVX2: Found an estimated cost of 16 {{.*}}.gather
+; SKL-LABEL: test_gather_4i32
+; SKL: Found an estimated cost of 6 {{.*}}.gather
+
; KNL-LABEL: test_gather_4i32
; KNL: Found an estimated cost of 16 {{.*}}.gather
@@ -103,6 +110,9 @@ define <4 x i32> @test_gather_4i32_const
; AVX2-LABEL: test_gather_4i32_const_mask
; AVX2: Found an estimated cost of 8 {{.*}}.gather
+; SKL-LABEL: test_gather_4i32_const_mask
+; SKL: Found an estimated cost of 6 {{.*}}.gather
+
; KNL-LABEL: test_gather_4i32_const_mask
; KNL: Found an estimated cost of 8 {{.*}}.gather
@@ -119,6 +129,9 @@ define <16 x float> @test_gather_16f32_c
; AVX2-LABEL: test_gather_16f32_const_mask
; AVX2: Found an estimated cost of 30 {{.*}}.gather
+; SKL-LABEL: test_gather_16f32_const_mask
+; SKL: Found an estimated cost of 24 {{.*}}.gather
+
; KNL-LABEL: test_gather_16f32_const_mask
; KNL: Found an estimated cost of 18 {{.*}}.gather
@@ -137,6 +150,9 @@ define <16 x float> @test_gather_16f32_v
; AVX2-LABEL: test_gather_16f32_var_mask
; AVX2: Found an estimated cost of 62 {{.*}}.gather
+; SKL-LABEL: test_gather_16f32_var_mask
+; SKL: Found an estimated cost of 24 {{.*}}.gather
+
; KNL-LABEL: test_gather_16f32_var_mask
; KNL: Found an estimated cost of 18 {{.*}}.gather
@@ -155,6 +171,9 @@ define <16 x float> @test_gather_16f32_r
; AVX2-LABEL: test_gather_16f32_ra_var_mask
; AVX2: Found an estimated cost of 62 {{.*}}.gather
+; SKL-LABEL: test_gather_16f32_ra_var_mask
+; SKL: Found an estimated cost of 24 {{.*}}.gather
+
; KNL-LABEL: test_gather_16f32_ra_var_mask
; KNL: Found an estimated cost of 20 {{.*}}.gather
@@ -173,6 +192,9 @@ define <16 x float> @test_gather_16f32_c
; AVX2-LABEL: test_gather_16f32_const_mask2
; AVX2: Found an estimated cost of 30 {{.*}}.gather
+; SKL-LABEL: test_gather_16f32_const_mask2
+; SKL: Found an estimated cost of 24 {{.*}}.gather
+
; KNL-LABEL: test_gather_16f32_const_mask2
; KNL: Found an estimated cost of 18 {{.*}}.gather
@@ -193,6 +215,9 @@ define void @test_scatter_16i32(i32* %ba
; AVX2-LABEL: test_scatter_16i32
; AVX2: Found an estimated cost of 64 {{.*}}.scatter
+; SKL-LABEL: test_scatter_16i32
+; SKL: Found an estimated cost of 64 {{.*}}.scatter
+
; KNL-LABEL: test_scatter_16i32
; KNL: Found an estimated cost of 18 {{.*}}.scatter
@@ -212,6 +237,9 @@ define void @test_scatter_8i32(<8 x i32>
; AVX2-LABEL: test_scatter_8i32
; AVX2: Found an estimated cost of 32 {{.*}}.scatter
+; SKL-LABEL: test_scatter_8i32
+; SKL: Found an estimated cost of 32 {{.*}}.scatter
+
; KNL-LABEL: test_scatter_8i32
; KNL: Found an estimated cost of 10 {{.*}}.scatter
@@ -228,6 +256,9 @@ define void @test_scatter_4i32(<4 x i32>
; AVX2-LABEL: test_scatter_4i32
; AVX2: Found an estimated cost of 16 {{.*}}.scatter
+; SKL-LABEL: test_scatter_4i32
+; SKL: Found an estimated cost of 16 {{.*}}.scatter
+
; KNL-LABEL: test_scatter_4i32
; KNL: Found an estimated cost of 16 {{.*}}.scatter
@@ -243,6 +274,9 @@ define <4 x float> @test_gather_4f32(flo
; AVX2-LABEL: test_gather_4f32
; AVX2: Found an estimated cost of 15 {{.*}}.gather
+; SKL-LABEL: test_gather_4f32
+; SKL: Found an estimated cost of 6 {{.*}}.gather
+
; KNL-LABEL: test_gather_4f32
; KNL: Found an estimated cost of 15 {{.*}}.gather
@@ -261,6 +295,9 @@ define <4 x float> @test_gather_4f32_con
; AVX2-LABEL: test_gather_4f32_const_mask
; AVX2: Found an estimated cost of 7 {{.*}}.gather
+; SKL-LABEL: test_gather_4f32_const_mask
+; SKL: Found an estimated cost of 6 {{.*}}.gather
+
; KNL-LABEL: test_gather_4f32_const_mask
; KNL: Found an estimated cost of 7 {{.*}}.gather
Modified: llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll?rev=318641&r1=318640&r2=318641&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll Mon Nov 20 00:18:12 2017
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X86 %s
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X64 %s
+; RUN: llc < %s -mcpu=skylake -mtriple=i386-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X86 %s
+; RUN: llc < %s -mcpu=skylake -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X64 %s
declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i32> %passthro)
@@ -8,47 +8,20 @@ define <2 x i32> @masked_gather_v2i32(<2
; X86-LABEL: masked_gather_v2i32:
; X86: # BB#0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
-; X86-NEXT: vpextrb $0, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: # implicit-def: %XMM2
-; X86-NEXT: je .LBB0_2
-; X86-NEXT: # BB#1: # %cond.load
-; X86-NEXT: vmovd %xmm3, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-NEXT: .LBB0_2: # %else
-; X86-NEXT: vpextrb $8, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB0_4
-; X86-NEXT: # BB#3: # %cond.load1
-; X86-NEXT: vpextrd $2, %xmm3, %eax
-; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
-; X86-NEXT: .LBB0_4: # %else2
-; X86-NEXT: vpsllq $63, %xmm0, %xmm0
-; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vpmovsxdq %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v2i32:
; X64: # BB#0: # %entry
-; X64-NEXT: vmovdqa (%rdi), %xmm3
-; X64-NEXT: vpextrb $0, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: # implicit-def: %XMM2
-; X64-NEXT: je .LBB0_2
-; X64-NEXT: # BB#1: # %cond.load
-; X64-NEXT: vmovq %xmm3, %rax
-; X64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: .LBB0_2: # %else
-; X64-NEXT: vpextrb $8, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB0_4
-; X64-NEXT: # BB#3: # %cond.load1
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: movl (%rax), %eax
-; X64-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2
-; X64-NEXT: .LBB0_4: # %else2
-; X64-NEXT: vpsllq $63, %xmm0, %xmm0
-; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; X64-NEXT: vmovdqa (%rdi), %xmm2
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
+; X64-NEXT: vpmovsxdq %xmm1, %xmm0
; X64-NEXT: retq
entry:
%ld = load <2 x i32*>, <2 x i32*>* %ptr
@@ -56,54 +29,53 @@ entry:
ret <2 x i32> %res
}
+define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
+; X86-LABEL: masked_gather_v2i32_concat:
+; X86: # BB#0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vpmovsxdq %xmm1, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v2i32_concat:
+; X64: # BB#0: # %entry
+; X64-NEXT: vmovdqa (%rdi), %xmm2
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
+; X64-NEXT: vpmovsxdq %xmm1, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: retq
+entry:
+ %ld = load <2 x i32*>, <2 x i32*>* %ptr
+ %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)
+ %res2 = shufflevector <2 x i32> %res, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res2
+}
+
declare <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ptrs, i32 %align, <2 x i1> %masks, <2 x float> %passthro)
define <2 x float> @masked_gather_v2float(<2 x float*>* %ptr, <2 x i1> %masks, <2 x float> %passthro) {
; X86-LABEL: masked_gather_v2float:
; X86: # BB#0: # %entry
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
-; X86-NEXT: vpextrb $0, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: # implicit-def: %XMM2
-; X86-NEXT: je .LBB1_2
-; X86-NEXT: # BB#1: # %cond.load
-; X86-NEXT: vmovd %xmm3, %eax
-; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-NEXT: .LBB1_2: # %else
-; X86-NEXT: vpextrb $8, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB1_4
-; X86-NEXT: # BB#3: # %cond.load1
-; X86-NEXT: vpextrd $2, %xmm3, %eax
-; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; X86-NEXT: .LBB1_4: # %else2
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: vpslld $31, %xmm0, %xmm0
-; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vmovaps %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v2float:
; X64: # BB#0: # %entry
-; X64-NEXT: vmovdqa (%rdi), %xmm3
-; X64-NEXT: vpextrb $0, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: # implicit-def: %XMM2
-; X64-NEXT: je .LBB1_2
-; X64-NEXT: # BB#1: # %cond.load
-; X64-NEXT: vmovq %xmm3, %rax
-; X64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: .LBB1_2: # %else
-; X64-NEXT: vpextrb $8, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB1_4
-; X64-NEXT: # BB#3: # %cond.load1
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; X64-NEXT: .LBB1_4: # %else2
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT: vpslld $31, %xmm0, %xmm0
-; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X64-NEXT: vmovaps (%rdi), %xmm2
+; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%ld = load <2 x float*>, <2 x float*>* %ptr
@@ -111,79 +83,45 @@ entry:
ret <2 x float> %res
}
+define <4 x float> @masked_gather_v2float_concat(<2 x float*>* %ptr, <2 x i1> %masks, <2 x float> %passthro) {
+; X86-LABEL: masked_gather_v2float_concat:
+; X86: # BB#0: # %entry
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vmovaps %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v2float_concat:
+; X64: # BB#0: # %entry
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X64-NEXT: vmovaps (%rdi), %xmm2
+; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %ld = load <2 x float*>, <2 x float*>* %ptr
+ %res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro)
+ %res2 = shufflevector <2 x float> %res, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %res2
+}
+
+
declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthro)
define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i32> %passthro) {
; X86-LABEL: masked_gather_v4i32:
; X86: # BB#0: # %entry
-; X86-NEXT: vpextrb $0, %xmm1, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: # implicit-def: %XMM3
-; X86-NEXT: je .LBB2_2
-; X86-NEXT: # BB#1: # %cond.load
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X86-NEXT: .LBB2_2: # %else
-; X86-NEXT: vpextrb $4, %xmm1, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB2_4
-; X86-NEXT: # BB#3: # %cond.load1
-; X86-NEXT: vpextrd $1, %xmm0, %eax
-; X86-NEXT: vpinsrd $1, (%eax), %xmm3, %xmm3
-; X86-NEXT: .LBB2_4: # %else2
-; X86-NEXT: vpextrb $8, %xmm1, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB2_6
-; X86-NEXT: # BB#5: # %cond.load4
-; X86-NEXT: vpextrd $2, %xmm0, %eax
-; X86-NEXT: vpinsrd $2, (%eax), %xmm3, %xmm3
-; X86-NEXT: .LBB2_6: # %else5
-; X86-NEXT: vpextrb $12, %xmm1, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB2_8
-; X86-NEXT: # BB#7: # %cond.load7
-; X86-NEXT: vpextrd $3, %xmm0, %eax
-; X86-NEXT: vpinsrd $3, (%eax), %xmm3, %xmm3
-; X86-NEXT: .LBB2_8: # %else8
-; X86-NEXT: vpslld $31, %xmm1, %xmm0
-; X86-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; X86-NEXT: vpgatherdd %xmm1, (,%xmm0), %xmm2
+; X86-NEXT: vmovdqa %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v4i32:
; X64: # BB#0: # %entry
-; X64-NEXT: vpextrb $0, %xmm1, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: # implicit-def: %XMM3
-; X64-NEXT: je .LBB2_2
-; X64-NEXT: # BB#1: # %cond.load
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X64-NEXT: .LBB2_2: # %else
-; X64-NEXT: vpextrb $4, %xmm1, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB2_4
-; X64-NEXT: # BB#3: # %cond.load1
-; X64-NEXT: vpextrq $1, %xmm0, %rax
-; X64-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
-; X64-NEXT: .LBB2_4: # %else2
-; X64-NEXT: vpextrb $8, %xmm1, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB2_6
-; X64-NEXT: # BB#5: # %cond.load4
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm4
-; X64-NEXT: vmovq %xmm4, %rax
-; X64-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
-; X64-NEXT: .LBB2_6: # %else5
-; X64-NEXT: vpextrb $12, %xmm1, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB2_8
-; X64-NEXT: # BB#7: # %cond.load7
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
-; X64-NEXT: vpextrq $1, %xmm0, %rax
-; X64-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
-; X64-NEXT: .LBB2_8: # %else8
-; X64-NEXT: vpslld $31, %xmm1, %xmm0
-; X64-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; X64-NEXT: vpgatherqd %xmm1, (,%ymm0), %xmm2
+; X64-NEXT: vmovdqa %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@@ -196,74 +134,14 @@ declare <4 x float> @llvm.masked.gather.
define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <4 x float> %passthro) {
; X86-LABEL: masked_gather_v4float:
; X86: # BB#0: # %entry
-; X86-NEXT: vpextrb $0, %xmm1, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: # implicit-def: %XMM3
-; X86-NEXT: je .LBB3_2
-; X86-NEXT: # BB#1: # %cond.load
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X86-NEXT: .LBB3_2: # %else
-; X86-NEXT: vpextrb $4, %xmm1, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB3_4
-; X86-NEXT: # BB#3: # %cond.load1
-; X86-NEXT: vpextrd $1, %xmm0, %eax
-; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
-; X86-NEXT: .LBB3_4: # %else2
-; X86-NEXT: vpextrb $8, %xmm1, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB3_6
-; X86-NEXT: # BB#5: # %cond.load4
-; X86-NEXT: vpextrd $2, %xmm0, %eax
-; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
-; X86-NEXT: .LBB3_6: # %else5
-; X86-NEXT: vpextrb $12, %xmm1, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB3_8
-; X86-NEXT: # BB#7: # %cond.load7
-; X86-NEXT: vpextrd $3, %xmm0, %eax
-; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
-; X86-NEXT: .LBB3_8: # %else8
-; X86-NEXT: vpslld $31, %xmm1, %xmm0
-; X86-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; X86-NEXT: vgatherdps %xmm1, (,%xmm0), %xmm2
+; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v4float:
; X64: # BB#0: # %entry
-; X64-NEXT: vpextrb $0, %xmm1, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: # implicit-def: %XMM3
-; X64-NEXT: je .LBB3_2
-; X64-NEXT: # BB#1: # %cond.load
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X64-NEXT: .LBB3_2: # %else
-; X64-NEXT: vpextrb $4, %xmm1, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB3_4
-; X64-NEXT: # BB#3: # %cond.load1
-; X64-NEXT: vpextrq $1, %xmm0, %rax
-; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
-; X64-NEXT: .LBB3_4: # %else2
-; X64-NEXT: vpextrb $8, %xmm1, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB3_6
-; X64-NEXT: # BB#5: # %cond.load4
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm4
-; X64-NEXT: vmovq %xmm4, %rax
-; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
-; X64-NEXT: .LBB3_6: # %else5
-; X64-NEXT: vpextrb $12, %xmm1, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB3_8
-; X64-NEXT: # BB#7: # %cond.load7
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
-; X64-NEXT: vpextrq $1, %xmm0, %rax
-; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
-; X64-NEXT: .LBB3_8: # %else8
-; X64-NEXT: vpslld $31, %xmm1, %xmm0
-; X64-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; X64-NEXT: vgatherqps %xmm1, (,%ymm0), %xmm2
+; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@@ -276,164 +154,25 @@ declare <8 x i32> @llvm.masked.gather.v8
define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i32> %passthro) {
; X86-LABEL: masked_gather_v8i32:
; X86: # BB#0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vmovdqa (%eax), %ymm3
-; X86-NEXT: vpextrb $0, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: # implicit-def: %YMM2
-; X86-NEXT: je .LBB4_2
-; X86-NEXT: # BB#1: # %cond.load
-; X86-NEXT: vmovd %xmm3, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-NEXT: .LBB4_2: # %else
-; X86-NEXT: vpextrb $2, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB4_4
-; X86-NEXT: # BB#3: # %cond.load1
-; X86-NEXT: vpextrd $1, %xmm3, %eax
-; X86-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm4
-; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; X86-NEXT: .LBB4_4: # %else2
-; X86-NEXT: vpextrb $4, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB4_6
-; X86-NEXT: # BB#5: # %cond.load4
-; X86-NEXT: vpextrd $2, %xmm3, %eax
-; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm4
-; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; X86-NEXT: .LBB4_6: # %else5
-; X86-NEXT: vpextrb $6, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB4_8
-; X86-NEXT: # BB#7: # %cond.load7
-; X86-NEXT: vpextrd $3, %xmm3, %eax
-; X86-NEXT: vpinsrd $3, (%eax), %xmm2, %xmm4
-; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; X86-NEXT: .LBB4_8: # %else8
-; X86-NEXT: vpextrb $8, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB4_10
-; X86-NEXT: # BB#9: # %cond.load10
-; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
-; X86-NEXT: vmovd %xmm4, %eax
-; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
-; X86-NEXT: vpinsrd $0, (%eax), %xmm4, %xmm4
-; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; X86-NEXT: .LBB4_10: # %else11
-; X86-NEXT: vpextrb $10, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB4_12
-; X86-NEXT: # BB#11: # %cond.load13
-; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
-; X86-NEXT: vpextrd $1, %xmm4, %eax
-; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
-; X86-NEXT: vpinsrd $1, (%eax), %xmm4, %xmm4
-; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; X86-NEXT: .LBB4_12: # %else14
-; X86-NEXT: vpextrb $12, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB4_14
-; X86-NEXT: # BB#13: # %cond.load16
-; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
-; X86-NEXT: vpextrd $2, %xmm4, %eax
-; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
-; X86-NEXT: vpinsrd $2, (%eax), %xmm4, %xmm4
-; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; X86-NEXT: .LBB4_14: # %else17
-; X86-NEXT: vpextrb $14, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB4_16
-; X86-NEXT: # BB#15: # %cond.load19
-; X86-NEXT: vextracti128 $1, %ymm3, %xmm3
-; X86-NEXT: vpextrd $3, %xmm3, %eax
-; X86-NEXT: vextracti128 $1, %ymm2, %xmm3
-; X86-NEXT: vpinsrd $3, (%eax), %xmm3, %xmm3
-; X86-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; X86-NEXT: .LBB4_16: # %else20
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X86-NEXT: vpslld $31, %ymm0, %ymm0
-; X86-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovdqa (%eax), %ymm2
+; X86-NEXT: vpgatherdd %ymm0, (,%ymm2), %ymm1
+; X86-NEXT: vmovdqa %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v8i32:
; X64: # BB#0: # %entry
-; X64-NEXT: vmovdqa (%rdi), %ymm4
-; X64-NEXT: vmovdqa 32(%rdi), %ymm3
-; X64-NEXT: vpextrb $0, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: # implicit-def: %YMM2
-; X64-NEXT: je .LBB4_2
-; X64-NEXT: # BB#1: # %cond.load
-; X64-NEXT: vmovq %xmm4, %rax
-; X64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: .LBB4_2: # %else
-; X64-NEXT: vpextrb $2, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB4_4
-; X64-NEXT: # BB#3: # %cond.load1
-; X64-NEXT: vpextrq $1, %xmm4, %rax
-; X64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm5
-; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; X64-NEXT: .LBB4_4: # %else2
-; X64-NEXT: vpextrb $4, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB4_6
-; X64-NEXT: # BB#5: # %cond.load4
-; X64-NEXT: vextracti128 $1, %ymm4, %xmm5
-; X64-NEXT: vmovq %xmm5, %rax
-; X64-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm5
-; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; X64-NEXT: .LBB4_6: # %else5
-; X64-NEXT: vpextrb $6, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB4_8
-; X64-NEXT: # BB#7: # %cond.load7
-; X64-NEXT: vextracti128 $1, %ymm4, %xmm4
-; X64-NEXT: vpextrq $1, %xmm4, %rax
-; X64-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm4
-; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; X64-NEXT: .LBB4_8: # %else8
-; X64-NEXT: vpextrb $8, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB4_10
-; X64-NEXT: # BB#9: # %cond.load10
-; X64-NEXT: vmovq %xmm3, %rax
-; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
-; X64-NEXT: vpinsrd $0, (%rax), %xmm4, %xmm4
-; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; X64-NEXT: .LBB4_10: # %else11
-; X64-NEXT: vpextrb $10, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB4_12
-; X64-NEXT: # BB#11: # %cond.load13
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
-; X64-NEXT: vpinsrd $1, (%rax), %xmm4, %xmm4
-; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; X64-NEXT: .LBB4_12: # %else14
-; X64-NEXT: vpextrb $12, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB4_14
-; X64-NEXT: # BB#13: # %cond.load16
-; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
-; X64-NEXT: vmovq %xmm4, %rax
-; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
-; X64-NEXT: vpinsrd $2, (%rax), %xmm4, %xmm4
-; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; X64-NEXT: .LBB4_14: # %else17
-; X64-NEXT: vpextrb $14, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB4_16
-; X64-NEXT: # BB#15: # %cond.load19
-; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vextracti128 $1, %ymm2, %xmm3
-; X64-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
-; X64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; X64-NEXT: .LBB4_16: # %else20
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpslld $31, %ymm0, %ymm0
-; X64-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; X64-NEXT: vpsrad $31, %ymm0, %ymm0
+; X64-NEXT: vmovdqa (%rdi), %ymm2
+; X64-NEXT: vmovdqa 32(%rdi), %ymm3
+; X64-NEXT: vextracti128 $1, %ymm1, %xmm4
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm5
+; X64-NEXT: vpgatherqd %xmm5, (,%ymm3), %xmm4
+; X64-NEXT: vpgatherqd %xmm0, (,%ymm2), %xmm1
+; X64-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm0
; X64-NEXT: retq
entry:
%ld = load <8 x i32*>, <8 x i32*>* %ptr
@@ -446,166 +185,25 @@ declare <8 x float> @llvm.masked.gather.
define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <8 x float> %passthro) {
; X86-LABEL: masked_gather_v8float:
; X86: # BB#0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vmovdqa (%eax), %ymm3
-; X86-NEXT: vpextrb $0, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: # implicit-def: %YMM2
-; X86-NEXT: je .LBB5_2
-; X86-NEXT: # BB#1: # %cond.load
-; X86-NEXT: vmovd %xmm3, %eax
-; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-NEXT: .LBB5_2: # %else
-; X86-NEXT: vpextrb $2, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB5_4
-; X86-NEXT: # BB#3: # %cond.load1
-; X86-NEXT: vpextrd $1, %xmm3, %eax
-; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0],mem[0],xmm2[2,3]
-; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; X86-NEXT: .LBB5_4: # %else2
-; X86-NEXT: vpextrb $4, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB5_6
-; X86-NEXT: # BB#5: # %cond.load4
-; X86-NEXT: vpextrd $2, %xmm3, %eax
-; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1],mem[0],xmm2[3]
-; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; X86-NEXT: .LBB5_6: # %else5
-; X86-NEXT: vpextrb $6, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB5_8
-; X86-NEXT: # BB#7: # %cond.load7
-; X86-NEXT: vpextrd $3, %xmm3, %eax
-; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
-; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; X86-NEXT: .LBB5_8: # %else8
-; X86-NEXT: vpextrb $8, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB5_10
-; X86-NEXT: # BB#9: # %cond.load10
-; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
-; X86-NEXT: vmovd %xmm4, %eax
-; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X86-NEXT: vextractf128 $1, %ymm2, %xmm5
-; X86-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; X86-NEXT: .LBB5_10: # %else11
-; X86-NEXT: vpextrb $10, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB5_12
-; X86-NEXT: # BB#11: # %cond.load13
-; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
-; X86-NEXT: vpextrd $1, %xmm4, %eax
-; X86-NEXT: vextractf128 $1, %ymm2, %xmm4
-; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; X86-NEXT: .LBB5_12: # %else14
-; X86-NEXT: vpextrb $12, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB5_14
-; X86-NEXT: # BB#13: # %cond.load16
-; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
-; X86-NEXT: vpextrd $2, %xmm4, %eax
-; X86-NEXT: vextractf128 $1, %ymm2, %xmm4
-; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; X86-NEXT: .LBB5_14: # %else17
-; X86-NEXT: vpextrb $14, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB5_16
-; X86-NEXT: # BB#15: # %cond.load19
-; X86-NEXT: vextracti128 $1, %ymm3, %xmm3
-; X86-NEXT: vpextrd $3, %xmm3, %eax
-; X86-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
-; X86-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; X86-NEXT: .LBB5_16: # %else20
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X86-NEXT: vpslld $31, %ymm0, %ymm0
-; X86-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovaps (%eax), %ymm2
+; X86-NEXT: vgatherdps %ymm0, (,%ymm2), %ymm1
+; X86-NEXT: vmovaps %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v8float:
; X64: # BB#0: # %entry
-; X64-NEXT: vmovdqa (%rdi), %ymm4
-; X64-NEXT: vmovdqa 32(%rdi), %ymm3
-; X64-NEXT: vpextrb $0, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: # implicit-def: %YMM2
-; X64-NEXT: je .LBB5_2
-; X64-NEXT: # BB#1: # %cond.load
-; X64-NEXT: vmovq %xmm4, %rax
-; X64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: .LBB5_2: # %else
-; X64-NEXT: vpextrb $2, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB5_4
-; X64-NEXT: # BB#3: # %cond.load1
-; X64-NEXT: vpextrq $1, %xmm4, %rax
-; X64-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0],mem[0],xmm2[2,3]
-; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; X64-NEXT: .LBB5_4: # %else2
-; X64-NEXT: vpextrb $4, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB5_6
-; X64-NEXT: # BB#5: # %cond.load4
-; X64-NEXT: vextracti128 $1, %ymm4, %xmm5
-; X64-NEXT: vmovq %xmm5, %rax
-; X64-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0,1],mem[0],xmm2[3]
-; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; X64-NEXT: .LBB5_6: # %else5
-; X64-NEXT: vpextrb $6, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB5_8
-; X64-NEXT: # BB#7: # %cond.load7
-; X64-NEXT: vextracti128 $1, %ymm4, %xmm4
-; X64-NEXT: vpextrq $1, %xmm4, %rax
-; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
-; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; X64-NEXT: .LBB5_8: # %else8
-; X64-NEXT: vpextrb $8, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB5_10
-; X64-NEXT: # BB#9: # %cond.load10
-; X64-NEXT: vmovq %xmm3, %rax
-; X64-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm5
-; X64-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; X64-NEXT: .LBB5_10: # %else11
-; X64-NEXT: vpextrb $10, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB5_12
-; X64-NEXT: # BB#11: # %cond.load13
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
-; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; X64-NEXT: .LBB5_12: # %else14
-; X64-NEXT: vpextrb $12, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB5_14
-; X64-NEXT: # BB#13: # %cond.load16
-; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
-; X64-NEXT: vmovq %xmm4, %rax
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
-; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; X64-NEXT: .LBB5_14: # %else17
-; X64-NEXT: vpextrb $14, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB5_16
-; X64-NEXT: # BB#15: # %cond.load19
-; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
-; X64-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; X64-NEXT: .LBB5_16: # %else20
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpslld $31, %ymm0, %ymm0
-; X64-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; X64-NEXT: vpsrad $31, %ymm0, %ymm0
+; X64-NEXT: vmovaps (%rdi), %ymm2
+; X64-NEXT: vmovaps 32(%rdi), %ymm3
+; X64-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm5
+; X64-NEXT: vgatherqps %xmm5, (,%ymm3), %xmm4
+; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm0
; X64-NEXT: retq
entry:
%ld = load <8 x float*>, <8 x float*>* %ptr
@@ -618,92 +216,23 @@ declare <4 x i64> @llvm.masked.gather.v4
define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i64> %passthro) {
; X86-LABEL: masked_gather_v4i64:
; X86: # BB#0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vmovdqa (%eax), %xmm3
-; X86-NEXT: vpextrb $0, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: # implicit-def: %YMM2
-; X86-NEXT: je .LBB6_2
-; X86-NEXT: # BB#1: # %cond.load
-; X86-NEXT: vmovd %xmm3, %eax
-; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT: .LBB6_2: # %else
-; X86-NEXT: vpextrb $4, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB6_4
-; X86-NEXT: # BB#3: # %cond.load1
-; X86-NEXT: vpextrd $1, %xmm3, %eax
-; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm4
-; X86-NEXT: vpinsrd $3, 4(%eax), %xmm4, %xmm4
-; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; X86-NEXT: .LBB6_4: # %else2
-; X86-NEXT: vpextrb $8, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB6_6
-; X86-NEXT: # BB#5: # %cond.load4
-; X86-NEXT: vpextrd $2, %xmm3, %eax
-; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
-; X86-NEXT: vpinsrd $0, (%eax), %xmm4, %xmm4
-; X86-NEXT: vpinsrd $1, 4(%eax), %xmm4, %xmm4
-; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; X86-NEXT: .LBB6_6: # %else5
-; X86-NEXT: vpextrb $12, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB6_8
-; X86-NEXT: # BB#7: # %cond.load7
-; X86-NEXT: vpextrd $3, %xmm3, %eax
-; X86-NEXT: vextracti128 $1, %ymm2, %xmm3
-; X86-NEXT: vpinsrd $2, (%eax), %xmm3, %xmm3
-; X86-NEXT: vpinsrd $3, 4(%eax), %xmm3, %xmm3
-; X86-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; X86-NEXT: .LBB6_8: # %else8
; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: vpsrad $31, %xmm0, %xmm0
; X86-NEXT: vpmovsxdq %xmm0, %ymm0
-; X86-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovdqa (%eax), %xmm2
+; X86-NEXT: vpgatherdq %ymm0, (,%xmm2), %ymm1
+; X86-NEXT: vmovdqa %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v4i64:
; X64: # BB#0: # %entry
-; X64-NEXT: vmovdqa (%rdi), %ymm3
-; X64-NEXT: vpextrb $0, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: # implicit-def: %YMM2
-; X64-NEXT: je .LBB6_2
-; X64-NEXT: # BB#1: # %cond.load
-; X64-NEXT: vmovq %xmm3, %rax
-; X64-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT: .LBB6_2: # %else
-; X64-NEXT: vpextrb $4, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB6_4
-; X64-NEXT: # BB#3: # %cond.load1
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm4
-; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; X64-NEXT: .LBB6_4: # %else2
-; X64-NEXT: vpextrb $8, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB6_6
-; X64-NEXT: # BB#5: # %cond.load4
-; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
-; X64-NEXT: vmovq %xmm4, %rax
-; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
-; X64-NEXT: vpinsrq $0, (%rax), %xmm4, %xmm4
-; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; X64-NEXT: .LBB6_6: # %else5
-; X64-NEXT: vpextrb $12, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB6_8
-; X64-NEXT: # BB#7: # %cond.load7
-; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vextracti128 $1, %ymm2, %xmm3
-; X64-NEXT: vpinsrq $1, (%rax), %xmm3, %xmm3
-; X64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; X64-NEXT: .LBB6_8: # %else8
; X64-NEXT: vpslld $31, %xmm0, %xmm0
+; X64-NEXT: vpsrad $31, %xmm0, %xmm0
; X64-NEXT: vpmovsxdq %xmm0, %ymm0
-; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; X64-NEXT: vmovdqa (%rdi), %ymm2
+; X64-NEXT: vpgatherqq %ymm0, (,%ymm2), %ymm1
+; X64-NEXT: vmovdqa %ymm1, %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x i64*>, <4 x i64*>* %ptr
@@ -716,89 +245,23 @@ declare <4 x double> @llvm.masked.gather
define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks, <4 x double> %passthro) {
; X86-LABEL: masked_gather_v4double:
; X86: # BB#0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vmovdqa (%eax), %xmm3
-; X86-NEXT: vpextrb $0, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: # implicit-def: %YMM2
-; X86-NEXT: je .LBB7_2
-; X86-NEXT: # BB#1: # %cond.load
-; X86-NEXT: vmovd %xmm3, %eax
-; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT: .LBB7_2: # %else
-; X86-NEXT: vpextrb $4, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB7_4
-; X86-NEXT: # BB#3: # %cond.load1
-; X86-NEXT: vpextrd $1, %xmm3, %eax
-; X86-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
-; X86-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
-; X86-NEXT: .LBB7_4: # %else2
-; X86-NEXT: vpextrb $8, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB7_6
-; X86-NEXT: # BB#5: # %cond.load4
-; X86-NEXT: vpextrd $2, %xmm3, %eax
-; X86-NEXT: vextractf128 $1, %ymm2, %xmm4
-; X86-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
-; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; X86-NEXT: .LBB7_6: # %else5
-; X86-NEXT: vpextrb $12, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB7_8
-; X86-NEXT: # BB#7: # %cond.load7
-; X86-NEXT: vpextrd $3, %xmm3, %eax
-; X86-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X86-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
-; X86-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; X86-NEXT: .LBB7_8: # %else8
; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: vpsrad $31, %xmm0, %xmm0
; X86-NEXT: vpmovsxdq %xmm0, %ymm0
-; X86-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovapd (%eax), %xmm2
+; X86-NEXT: vgatherdpd %ymm0, (,%xmm2), %ymm1
+; X86-NEXT: vmovapd %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v4double:
; X64: # BB#0: # %entry
-; X64-NEXT: vmovdqa (%rdi), %ymm3
-; X64-NEXT: vpextrb $0, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: # implicit-def: %YMM2
-; X64-NEXT: je .LBB7_2
-; X64-NEXT: # BB#1: # %cond.load
-; X64-NEXT: vmovq %xmm3, %rax
-; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT: .LBB7_2: # %else
-; X64-NEXT: vpextrb $4, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB7_4
-; X64-NEXT: # BB#3: # %cond.load1
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
-; X64-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
-; X64-NEXT: .LBB7_4: # %else2
-; X64-NEXT: vpextrb $8, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB7_6
-; X64-NEXT: # BB#5: # %cond.load4
-; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
-; X64-NEXT: vmovq %xmm4, %rax
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
-; X64-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
-; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; X64-NEXT: .LBB7_6: # %else5
-; X64-NEXT: vpextrb $12, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB7_8
-; X64-NEXT: # BB#7: # %cond.load7
-; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
-; X64-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; X64-NEXT: .LBB7_8: # %else8
; X64-NEXT: vpslld $31, %xmm0, %xmm0
+; X64-NEXT: vpsrad $31, %xmm0, %xmm0
; X64-NEXT: vpmovsxdq %xmm0, %ymm0
-; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; X64-NEXT: vmovapd (%rdi), %ymm2
+; X64-NEXT: vgatherqpd %ymm0, (,%ymm2), %ymm1
+; X64-NEXT: vmovapd %ymm1, %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x double*>, <4 x double*>* %ptr
@@ -812,47 +275,16 @@ define <2 x i64> @masked_gather_v2i64(<2
; X86-LABEL: masked_gather_v2i64:
; X86: # BB#0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
-; X86-NEXT: vpextrb $0, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: # implicit-def: %XMM2
-; X86-NEXT: je .LBB8_2
-; X86-NEXT: # BB#1: # %cond.load
-; X86-NEXT: vmovd %xmm3, %eax
-; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT: .LBB8_2: # %else
-; X86-NEXT: vpextrb $8, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB8_4
-; X86-NEXT: # BB#3: # %cond.load1
-; X86-NEXT: vpextrd $2, %xmm3, %eax
-; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
-; X86-NEXT: vpinsrd $3, 4(%eax), %xmm2, %xmm2
-; X86-NEXT: .LBB8_4: # %else2
-; X86-NEXT: vpsllq $63, %xmm0, %xmm0
-; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v2i64:
; X64: # BB#0: # %entry
-; X64-NEXT: vmovdqa (%rdi), %xmm3
-; X64-NEXT: vpextrb $0, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: # implicit-def: %XMM2
-; X64-NEXT: je .LBB8_2
-; X64-NEXT: # BB#1: # %cond.load
-; X64-NEXT: vmovq %xmm3, %rax
-; X64-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT: .LBB8_2: # %else
-; X64-NEXT: vpextrb $8, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB8_4
-; X64-NEXT: # BB#3: # %cond.load1
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2
-; X64-NEXT: .LBB8_4: # %else2
-; X64-NEXT: vpsllq $63, %xmm0, %xmm0
-; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; X64-NEXT: vmovdqa (%rdi), %xmm2
+; X64-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
entry:
%ld = load <2 x i64*>, <2 x i64*>* %ptr
@@ -866,46 +298,16 @@ define <2 x double> @masked_gather_v2dou
; X86-LABEL: masked_gather_v2double:
; X86: # BB#0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
-; X86-NEXT: vpextrb $0, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: # implicit-def: %XMM2
-; X86-NEXT: je .LBB9_2
-; X86-NEXT: # BB#1: # %cond.load
-; X86-NEXT: vmovd %xmm3, %eax
-; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT: .LBB9_2: # %else
-; X86-NEXT: vpextrb $8, %xmm0, %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB9_4
-; X86-NEXT: # BB#3: # %cond.load1
-; X86-NEXT: vpextrd $2, %xmm3, %eax
-; X86-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; X86-NEXT: .LBB9_4: # %else2
-; X86-NEXT: vpsllq $63, %xmm0, %xmm0
-; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vmovapd %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v2double:
; X64: # BB#0: # %entry
-; X64-NEXT: vmovdqa (%rdi), %xmm3
-; X64-NEXT: vpextrb $0, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: # implicit-def: %XMM2
-; X64-NEXT: je .LBB9_2
-; X64-NEXT: # BB#1: # %cond.load
-; X64-NEXT: vmovq %xmm3, %rax
-; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT: .LBB9_2: # %else
-; X64-NEXT: vpextrb $8, %xmm0, %eax
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB9_4
-; X64-NEXT: # BB#3: # %cond.load1
-; X64-NEXT: vpextrq $1, %xmm3, %rax
-; X64-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; X64-NEXT: .LBB9_4: # %else2
-; X64-NEXT: vpsllq $63, %xmm0, %xmm0
-; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; X64-NEXT: vmovapd (%rdi), %xmm2
+; X64-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1
+; X64-NEXT: vmovapd %xmm1, %xmm0
; X64-NEXT: retq
entry:
%ld = load <2 x double*>, <2 x double*>* %ptr
More information about the llvm-commits
mailing list