[llvm] r255629 - Type legalizer for masked gather and scatter intrinsics.
Elena Demikhovsky via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 15 00:40:44 PST 2015
Author: delena
Date: Tue Dec 15 02:40:41 2015
New Revision: 255629
URL: http://llvm.org/viewvc/llvm-project?rev=255629&view=rev
Log:
Type legalizer for masked gather and scatter intrinsics.
A full type legalizer that works with all vector lengths from 2 to 16 and the element types i32, i64, float, and double.
This intrinsic, for example,
void @llvm.masked.scatter.v2f32(<2 x float> %data, <2 x float*> %ptrs, i32 align, <2 x i1> %mask)
requires type widening for the data and type promotion for the mask.
Differential Revision: http://reviews.llvm.org/D13633
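As a minimal IR sketch of the example above (a hypothetical test case; function and value names are illustrative, not part of this commit), such a call now survives type legalization: the <2 x float> data operand is widened to a legal vector width and the <2 x i1> mask is promoted to a legal mask type.

; hypothetical test case; names are illustrative
declare void @llvm.masked.scatter.v2f32(<2 x float>, <2 x float*>, i32, <2 x i1>)

define void @scatter2(<2 x float> %data, <2 x float*> %ptrs, <2 x i1> %mask) {
  call void @llvm.masked.scatter.v2f32(<2 x float> %data, <2 x float*> %ptrs, i32 4, <2 x i1> %mask)
  ret void
}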
Modified:
llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
llvm/trunk/test/CodeGen/X86/masked_memop.ll
Modified: llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h?rev=255629&r1=255628&r2=255629&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h (original)
+++ llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h Tue Dec 15 02:40:41 2015
@@ -2122,12 +2122,13 @@ public:
: MaskedGatherScatterSDNode(ISD::MGATHER, Order, dl, Operands, VTs, MemVT,
MMO) {
assert(getValue().getValueType() == getValueType(0) &&
- "Incompatible type of the PathThru value in MaskedGatherSDNode");
+ "Incompatible type of the PassThru value in MaskedGatherSDNode");
assert(getMask().getValueType().getVectorNumElements() ==
- getValueType(0).getVectorNumElements() &&
- "Vector width mismatch between mask and data");
- assert(getMask().getValueType().getScalarType() == MVT::i1 &&
+ getValueType(0).getVectorNumElements() &&
"Vector width mismatch between mask and data");
+ assert(getIndex().getValueType().getVectorNumElements() ==
+ getValueType(0).getVectorNumElements() &&
+ "Vector width mismatch between index and data");
}
static bool classof(const SDNode *N) {
@@ -2143,13 +2144,14 @@ public:
friend class SelectionDAG;
MaskedScatterSDNode(unsigned Order, DebugLoc dl,ArrayRef<SDValue> Operands,
SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
- : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, Operands, VTs,
- MemVT, MMO) {
+ : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, Operands, VTs, MemVT,
+ MMO) {
assert(getMask().getValueType().getVectorNumElements() ==
- getValue().getValueType().getVectorNumElements() &&
- "Vector width mismatch between mask and data");
- assert(getMask().getValueType().getScalarType() == MVT::i1 &&
+ getValue().getValueType().getVectorNumElements() &&
"Vector width mismatch between mask and data");
+ assert(getIndex().getValueType().getVectorNumElements() ==
+ getValue().getValueType().getVectorNumElements() &&
+ "Vector width mismatch between index and data");
}
static bool classof(const SDNode *N) {
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp?rev=255629&r1=255628&r2=255629&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp Tue Dec 15 02:40:41 2015
@@ -66,8 +66,11 @@ void DAGTypeLegalizer::PromoteIntegerRes
case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break;
case ISD::EXTRACT_VECTOR_ELT:
Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
- case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break;
- case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));break;
+ case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break;
+ case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));
+ break;
+ case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
+ break;
case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break;
case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break;
case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break;
@@ -181,7 +184,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_
N->getChain(), N->getBasePtr(),
N->getMemOperand(), N->getOrdering(),
N->getSynchScope());
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
@@ -194,7 +197,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_
N->getChain(), N->getBasePtr(),
Op2, N->getMemOperand(), N->getOrdering(),
N->getSynchScope());
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
@@ -479,7 +482,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_
SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(),
N->getMemoryVT(), N->getMemOperand());
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
@@ -489,20 +492,34 @@ SDValue DAGTypeLegalizer::PromoteIntRes_
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0());
- SDValue Mask = N->getMask();
- EVT NewMaskVT = getSetCCResultType(NVT);
- if (NewMaskVT != N->getMask().getValueType())
- Mask = PromoteTargetBoolean(Mask, NewMaskVT);
SDLoc dl(N);
-
SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
- Mask, ExtSrc0, N->getMemoryVT(),
+ N->getMask(), ExtSrc0, N->getMemoryVT(),
N->getMemOperand(), ISD::SEXTLOAD);
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue ExtSrc0 = GetPromotedInteger(N->getValue());
+ assert(NVT == ExtSrc0.getValueType() &&
+ "Gather result type and the passThru agrument type should be the same");
+
+ SDLoc dl(N);
+ SDValue Ops[] = {N->getChain(), ExtSrc0, N->getMask(), N->getBasePtr(),
+ N->getIndex()};
+ SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other),
+ N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
/// Promote the overflow flag of an overflowing arithmetic node.
SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
// Simply change the return type of the boolean result.
@@ -889,6 +906,10 @@ bool DAGTypeLegalizer::PromoteIntegerOpe
OpNo); break;
case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
OpNo); break;
+ case ISD::MGATHER: Res = PromoteIntOp_MGATHER(cast<MaskedGatherSDNode>(N),
+ OpNo); break;
+ case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast<MaskedScatterSDNode>(N),
+ OpNo); break;
case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break;
case ISD::FP16_TO_FP:
case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break;
@@ -1157,56 +1178,49 @@ SDValue DAGTypeLegalizer::PromoteIntOp_S
N->getMemoryVT(), N->getMemOperand());
}
-SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){
+SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
+ unsigned OpNo) {
SDValue DataOp = N->getValue();
EVT DataVT = DataOp.getValueType();
SDValue Mask = N->getMask();
- EVT MaskVT = Mask.getValueType();
SDLoc dl(N);
bool TruncateStore = false;
- if (!TLI.isTypeLegal(DataVT)) {
- if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) {
- DataOp = GetPromotedInteger(DataOp);
- if (!TLI.isTypeLegal(MaskVT))
- Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
- TruncateStore = true;
- }
+ if (OpNo == 2) {
+ // The mask comes before the data operand. If the data operand is legal, we
+ // just promote the mask.
+ // When the data operand has an illegal type, we should legalize the data
+ // operand first. The mask will be promoted/split/widened according to
+ // the data operand type.
+ if (TLI.isTypeLegal(DataVT))
+ Mask = PromoteTargetBoolean(Mask, DataVT);
else {
- assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector &&
- "Unexpected data legalization in MSTORE");
- DataOp = GetWidenedVector(DataOp);
-
- if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
- Mask = GetWidenedVector(Mask);
- else {
- EVT BoolVT = getSetCCResultType(DataOp.getValueType());
+ if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger)
+ return PromoteIntOp_MSTORE(N, 3);
- // We can't use ModifyToType() because we should fill the mask with
- // zeroes
- unsigned WidenNumElts = BoolVT.getVectorNumElements();
- unsigned MaskNumElts = MaskVT.getVectorNumElements();
-
- unsigned NumConcat = WidenNumElts / MaskNumElts;
- SmallVector<SDValue, 16> Ops(NumConcat);
- SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
- Ops[0] = Mask;
- for (unsigned i = 1; i != NumConcat; ++i)
- Ops[i] = ZeroVal;
+ else if (getTypeAction(DataVT) == TargetLowering::TypeWidenVector)
+ return WidenVecOp_MSTORE(N, 3);
- Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
+ else {
+ assert (getTypeAction(DataVT) == TargetLowering::TypeSplitVector);
+ return SplitVecOp_MSTORE(N, 3);
}
}
+ } else { // Data operand
+ assert(OpNo == 3 && "Unexpected operand for promotion");
+ DataOp = GetPromotedInteger(DataOp);
+ Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
+ TruncateStore = true;
}
- else
- Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType());
+
return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
N->getMemoryVT(), N->getMemOperand(),
TruncateStore);
}
-SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){
+SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
+ unsigned OpNo) {
assert(OpNo == 2 && "Only know how to promote the mask!");
EVT DataVT = N->getValueType(0);
SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
@@ -1215,6 +1229,31 @@ SDValue DAGTypeLegalizer::PromoteIntOp_M
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N,
+ unsigned OpNo) {
+
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ if (OpNo == 2) {
+ // The Mask
+ EVT DataVT = N->getValueType(0);
+ NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+ } else
+ NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
+ unsigned OpNo) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ if (OpNo == 2) {
+ // The Mask
+ EVT DataVT = N->getValue().getValueType();
+ NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+ } else
+ NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
SDValue Op = GetPromotedInteger(N->getOperand(0));
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op);
@@ -2071,7 +2110,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD
}
}
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Ch);
}
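To illustrate the promotion paths added above, a sketch (assuming an x86-64 AVX-512 target, where <2 x i32> is promoted to <2 x i64> as noted in the X86 lowering below; names are hypothetical): PromoteIntRes_MGATHER promotes the gather result and its pass-through operand, while PromoteIntOp_MGATHER/PromoteIntOp_MSCATTER promote the mask or index operands in place.

; hypothetical example: the <2 x i32> result and pass-through are promoted
; to <2 x i64>; the <2 x i1> mask is promoted to match the data
declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)

define <2 x i32> @gather2(<2 x i32*> %ptrs, <2 x i1> %mask, <2 x i32> %src0) {
  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32> %res
}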
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp?rev=255629&r1=255628&r2=255629&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp Tue Dec 15 02:40:41 2015
@@ -1127,6 +1127,23 @@ SDValue DAGTypeLegalizer::PromoteTargetB
return DAG.getNode(ExtendCode, dl, BoolVT, Bool);
}
+/// WidenTargetBoolean - Widen the given target boolean to a target boolean
+/// of the given type. The boolean vector is widened and then promoted to match
+/// the target boolean type of the given ValVT.
+SDValue DAGTypeLegalizer::WidenTargetBoolean(SDValue Bool, EVT ValVT,
+ bool WithZeroes) {
+ SDLoc dl(Bool);
+ EVT BoolVT = Bool.getValueType();
+
+ assert(ValVT.getVectorNumElements() > BoolVT.getVectorNumElements() &&
+ TLI.isTypeLegal(ValVT) &&
+ "Unexpected types in WidenTargetBoolean");
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(), BoolVT.getScalarType(),
+ ValVT.getVectorNumElements());
+ Bool = ModifyToType(Bool, WideVT, WithZeroes);
+ return PromoteTargetBoolean(Bool, ValVT);
+}
+
/// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT
/// bits in Hi.
void DAGTypeLegalizer::SplitInteger(SDValue Op,
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h?rev=255629&r1=255628&r2=255629&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h Tue Dec 15 02:40:41 2015
@@ -187,6 +187,11 @@ private:
std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT);
+
+ /// Modify the bit vector to match the SetCC result type of ValVT.
+ /// The bit vector is widened with zeroes when WithZeroes is true.
+ SDValue WidenTargetBoolean(SDValue Bool, EVT ValVT, bool WithZeroes = false);
+
void ReplaceValueWith(SDValue From, SDValue To);
void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
@@ -261,6 +266,7 @@ private:
SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
SDValue PromoteIntRes_LOAD(LoadSDNode *N);
SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
+ SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
SDValue PromoteIntRes_Overflow(SDNode *N);
SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_SDIV(SDNode *N);
@@ -307,6 +313,8 @@ private:
SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -710,6 +718,7 @@ private:
SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
SDValue WidenVecRes_LOAD(SDNode* N);
SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
+ SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
SDValue WidenVecRes_SELECT(SDNode* N);
SDValue WidenVecRes_SELECT_CC(SDNode* N);
@@ -737,6 +746,7 @@ private:
SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
SDValue WidenVecOp_STORE(SDNode* N);
SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
+ SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo);
SDValue WidenVecOp_SETCC(SDNode* N);
SDValue WidenVecOp_Convert(SDNode *N);
@@ -776,8 +786,10 @@ private:
/// Modifies a vector input (widen or narrows) to a vector of NVT. The
/// input vector must have the same element type as NVT.
- SDValue ModifyToType(SDValue InOp, EVT WidenVT);
-
+ /// When FillWithZeroes is "on" the vector will be widened with
+ /// zeroes.
+ /// By default, the vector will be widened with undefined values.
+ SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false);
//===--------------------------------------------------------------------===//
// Generic Splitting: LegalizeTypesGeneric.cpp
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp?rev=255629&r1=255628&r2=255629&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Tue Dec 15 02:40:41 2015
@@ -235,7 +235,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRe
N->isInvariant(), N->getOriginalAlignment(),
N->getAAInfo());
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
return Result;
@@ -1020,7 +1020,7 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(LD, 1), Ch);
}
@@ -1034,6 +1034,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD
SDValue Ch = MLD->getChain();
SDValue Ptr = MLD->getBasePtr();
SDValue Mask = MLD->getMask();
+ SDValue Src0 = MLD->getSrc0();
unsigned Alignment = MLD->getOriginalAlignment();
ISD::LoadExtType ExtType = MLD->getExtensionType();
@@ -1043,16 +1044,22 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD
(Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
Alignment/2 : Alignment;
+ // Split Mask operand
SDValue MaskLo, MaskHi;
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
EVT MemoryVT = MLD->getMemoryVT();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
- SDValue Src0 = MLD->getSrc0();
SDValue Src0Lo, Src0Hi;
- std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
+ if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Src0, Src0Lo, Src0Hi);
+ else
+ std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MLD->getPointerInfo(),
@@ -1080,7 +1087,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MLD, 1), Ch);
@@ -1095,20 +1102,33 @@ void DAGTypeLegalizer::SplitVecRes_MGATH
SDValue Ch = MGT->getChain();
SDValue Ptr = MGT->getBasePtr();
SDValue Mask = MGT->getMask();
+ SDValue Src0 = MGT->getValue();
+ SDValue Index = MGT->getIndex();
unsigned Alignment = MGT->getOriginalAlignment();
+ // Split Mask operand
SDValue MaskLo, MaskHi;
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
EVT MemoryVT = MGT->getMemoryVT();
EVT LoMemVT, HiMemVT;
+ // Split MemoryVT
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue Src0Lo, Src0Hi;
- std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(MGT->getValue(), dl);
+ if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Src0, Src0Lo, Src0Hi);
+ else
+ std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
SDValue IndexHi, IndexLo;
- std::tie(IndexLo, IndexHi) = DAG.SplitVector(MGT->getIndex(), dl);
+ if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Index, IndexLo, IndexHi);
+ else
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MGT->getPointerInfo(),
@@ -1128,7 +1148,7 @@ void DAGTypeLegalizer::SplitVecRes_MGATH
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MGT, 1), Ch);
}
@@ -1599,23 +1619,31 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGA
SDValue Ptr = MGT->getBasePtr();
SDValue Index = MGT->getIndex();
SDValue Mask = MGT->getMask();
+ SDValue Src0 = MGT->getValue();
unsigned Alignment = MGT->getOriginalAlignment();
SDValue MaskLo, MaskHi;
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Mask operand
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
EVT MemoryVT = MGT->getMemoryVT();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue Src0Lo, Src0Hi;
- std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(MGT->getValue(), dl);
+ if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Src0, Src0Lo, Src0Hi);
+ else
+ std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
SDValue IndexHi, IndexLo;
- if (Index.getNode())
- std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
+ if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Index, IndexLo, IndexHi);
else
- IndexLo = IndexHi = Index;
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MGT->getPointerInfo(),
@@ -1641,7 +1669,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGA
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MGT, 1), Ch);
@@ -1665,9 +1693,21 @@ SDValue DAGTypeLegalizer::SplitVecOp_MST
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue DataLo, DataHi;
- GetSplitVector(Data, DataLo, DataHi);
+ if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Data operand
+ GetSplitVector(Data, DataLo, DataHi);
+ else
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
SDValue MaskLo, MaskHi;
- GetSplitVector(Mask, MaskLo, MaskHi);
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Mask operand
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+
+ MaskLo = PromoteTargetBoolean(MaskLo, DataLo.getValueType());
+ MaskHi = PromoteTargetBoolean(MaskHi, DataHi.getValueType());
// if Alignment is equal to the vector size,
// take the half of it for the second part
@@ -1712,25 +1752,29 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSC
unsigned Alignment = N->getOriginalAlignment();
SDLoc DL(N);
+ // Split all operands
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue DataLo, DataHi;
- GetSplitVector(Data, DataLo, DataHi);
- SDValue MaskLo, MaskHi;
- GetSplitVector(Mask, MaskLo, MaskHi);
+ if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Data operand
+ GetSplitVector(Data, DataLo, DataHi);
+ else
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
- SDValue PtrLo, PtrHi;
- if (Ptr.getValueType().isVector()) // gather form vector of pointers
- std::tie(PtrLo, PtrHi) = DAG.SplitVector(Ptr, DL);
+ SDValue MaskLo, MaskHi;
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Mask operand
+ GetSplitVector(Mask, MaskLo, MaskHi);
else
- PtrLo = PtrHi = Ptr;
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
SDValue IndexHi, IndexLo;
- if (Index.getNode())
- std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
+ if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Index, IndexLo, IndexHi);
else
- IndexLo = IndexHi = Index;
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
SDValue Lo, Hi;
MachineMemOperand *MMO = DAG.getMachineFunction().
@@ -1738,7 +1782,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSC
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
- SDValue OpsLo[] = {Ch, DataLo, MaskLo, PtrLo, IndexLo};
+ SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo};
Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
DL, OpsLo, MMO);
@@ -1747,7 +1791,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSC
MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
- SDValue OpsHi[] = {Ch, DataHi, MaskHi, PtrHi, IndexHi};
+ SDValue OpsHi[] = {Ch, DataHi, MaskHi, Ptr, IndexHi};
Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
DL, OpsHi, MMO);
@@ -1975,6 +2019,9 @@ void DAGTypeLegalizer::WidenVectorResult
case ISD::MLOAD:
Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
break;
+ case ISD::MGATHER:
+ Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N));
+ break;
case ISD::ADD:
case ISD::AND:
@@ -2728,7 +2775,35 @@ SDValue DAGTypeLegalizer::WidenVecRes_ML
SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
Mask, Src0, N->getMemoryVT(),
N->getMemOperand(), ExtType);
- // Legalized the chain result - switch anything that used the old chain to
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
+
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Mask = N->getMask();
+ SDValue Src0 = GetWidenedVector(N->getValue());
+ unsigned NumElts = WideVT.getVectorNumElements();
+ SDLoc dl(N);
+
+ // The mask should be widened as well
+ Mask = WidenTargetBoolean(Mask, WideVT, true);
+
+ // Widen the Index operand
+ SDValue Index = N->getIndex();
+ EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
+ Index.getValueType().getScalarType(),
+ NumElts);
+ Index = ModifyToType(Index, WideIndexVT);
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other),
+ N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+
+ // Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
@@ -2890,6 +2965,7 @@ bool DAGTypeLegalizer::WidenVectorOperan
case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
case ISD::STORE: Res = WidenVecOp_STORE(N); break;
case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
+ case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break;
case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break;
@@ -3137,6 +3213,34 @@ SDValue DAGTypeLegalizer::WidenVecOp_MST
false);
}
+SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 1 && "Can widen only data operand of mscatter");
+ MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
+ SDValue DataOp = MSC->getValue();
+ SDValue Mask = MSC->getMask();
+
+ // Widen the value
+ SDValue WideVal = GetWidenedVector(DataOp);
+ EVT WideVT = WideVal.getValueType();
+ unsigned NumElts = WideVal.getValueType().getVectorNumElements();
+ SDLoc dl(N);
+
+ // The mask should be widened as well
+ Mask = WidenTargetBoolean(Mask, WideVT, true);
+
+ // Widen index
+ SDValue Index = MSC->getIndex();
+ EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
+ Index.getValueType().getScalarType(),
+ NumElts);
+ Index = ModifyToType(Index, WideIndexVT);
+
+ SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
+ MSC->getMemoryVT(), dl, Ops,
+ MSC->getMemOperand());
+}
+
SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
SDValue InOp0 = GetWidenedVector(N->getOperand(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
@@ -3600,7 +3704,9 @@ DAGTypeLegalizer::GenWidenVectorTruncSto
/// Modifies a vector input (widen or narrows) to a vector of NVT. The
/// input vector must have the same element type as NVT.
-SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) {
+/// FillWithZeroes specifies that the vector should be widened with zeroes.
+SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
+ bool FillWithZeroes) {
// Note that InOp might have been widened so it might already have
// the right width or it might need be narrowed.
EVT InVT = InOp.getValueType();
@@ -3617,10 +3723,11 @@ SDValue DAGTypeLegalizer::ModifyToType(S
if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
unsigned NumConcat = WidenNumElts / InNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
- SDValue UndefVal = DAG.getUNDEF(InVT);
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) :
+ DAG.getUNDEF(InVT);
Ops[0] = InOp;
for (unsigned i = 1; i != NumConcat; ++i)
- Ops[i] = UndefVal;
+ Ops[i] = FillVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops);
}
@@ -3640,8 +3747,9 @@ SDValue DAGTypeLegalizer::ModifyToType(S
ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- SDValue UndefVal = DAG.getUNDEF(EltVT);
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
+ DAG.getUNDEF(EltVT);
for ( ; Idx < WidenNumElts; ++Idx)
- Ops[Idx] = UndefVal;
+ Ops[Idx] = FillVal;
return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
}
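For the splitting changes above, a sketch (a hypothetical example; the exact split depends on the target's widest legal vector): a gather wider than 512 bits, such as <16 x double>, is split into two <8 x double> halves, and the mask, index, and pass-through operands are now retrieved with GetSplitVector when they were themselves marked for splitting.

; hypothetical example: a 1024-bit gather split into two 512-bit halves
declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*>, i32, <16 x i1>, <16 x double>)

define <16 x double> @gather16(<16 x double*> %ptrs, <16 x i1> %mask) {
  %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 8, <16 x i1> %mask, <16 x double> undef)
  ret <16 x double> %res
}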
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=255629&r1=255628&r2=255629&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Dec 15 02:40:41 2015
@@ -1579,7 +1579,7 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::OR, VT, Legal);
setOperationAction(ISD::XOR, VT, Legal);
}
- if (EltSize >= 32 && VT.getSizeInBits() <= 512) {
+ if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) {
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
@@ -1605,6 +1605,8 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Legal);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
}
}
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
@@ -1813,6 +1815,8 @@ X86TargetLowering::X86TargetLowering(con
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::MSCATTER);
+ setTargetDAGCombine(ISD::MGATHER);
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -19760,6 +19764,16 @@ static SDValue ExtendToType(SDValue InOp
EVT EltVT = NVT.getVectorElementType();
SDLoc dl(InOp);
+ if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
+ InOp.getNumOperands() == 2) {
+ SDValue N1 = InOp.getOperand(1);
+ if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
+ N1.isUndef()) {
+ InOp = InOp.getOperand(0);
+ InVT = InOp.getSimpleValueType();
+ InNumElts = InVT.getVectorNumElements();
+ }
+ }
if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
SmallVector<SDValue, 16> Ops;
@@ -19783,28 +19797,93 @@ static SDValue LowerMSCATTER(SDValue Op,
assert(Subtarget->hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
+ // X86 scatter kills the mask register, so its type should be added to
+ // the list of return values.
+ // If the "scatter" has 2 return values, it is already handled.
+ if (Op.getNode()->getNumValues() == 2)
+ return Op;
+
MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
- MVT VT = N->getValue().getSimpleValueType();
+ SDValue Src = N->getValue();
+ MVT VT = Src.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
- // X86 scatter kills mask register, so its type should be added to
- // the list of return values
- if (N->getNumValues() == 1) {
- SDValue Index = N->getIndex();
- if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
- !Index.getSimpleValueType().is512BitVector())
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-
- SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other);
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
- N->getOperand(3), Index };
-
- SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
- return SDValue(NewScatter.getNode(), 0);
+ SDValue NewScatter;
+ SDValue Index = N->getIndex();
+ SDValue Mask = N->getMask();
+ SDValue Chain = N->getChain();
+ SDValue BasePtr = N->getBasePtr();
+ MVT MemVT = N->getMemoryVT().getSimpleVT();
+ MVT IndexVT = Index.getSimpleValueType();
+ MVT MaskVT = Mask.getSimpleValueType();
+
+ if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
+ // The v2i32 value was promoted to v2i64.
+ // Now we "redo" the type legalizer's work and widen the original
+ // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
+ // with a shuffle.
+ assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
+ "Unexpected memory type");
+ int ShuffleMask[] = {0, 2, -1, -1};
+ Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
+ DAG.getUNDEF(MVT::v4i32), ShuffleMask);
+ // Now we have 4 elements instead of 2.
+ // Expand the index.
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
+ Index = ExtendToType(Index, NewIndexVT, DAG);
+
+ // Expand the mask with zeroes
+ // Mask may be <2 x i64> or <2 x i1> at this moment
+ assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
+ "Unexpected mask type");
+ MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
+ Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+ VT = MVT::v4i32;
}
- return Op;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+ !Index.getSimpleValueType().is512BitVector()) {
+ // AVX512F supports only 512-bit vectors. Either the data or the index
+ // should be 512 bits wide. If both the index and the data are 256-bit
+ // here, but the vector contains 8 elements, we just sign-extend the index.
+ if (IndexVT == MVT::v8i32)
+ // Just extend index
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+ else {
+ // The minimal number of elements in a scatter is 8
+ NumElts = 8;
+ // Index
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+ // Use original index here, do not modify the index twice
+ Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
+ if (IndexVT.getScalarType() == MVT::i32)
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+ // Mask
+ // At this point we have a promoted mask operand
+ assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+ MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+ // Use the original mask here, do not modify the mask twice
+ Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
+
+ // The value that should be stored
+ MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+ Src = ExtendToType(Src, NewVT, DAG);
+ }
+ }
+ // If the mask is "wide" at this point - truncate it to i1 vector
+ MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
+
+ // The mask is killed by scatter, add it to the values
+ SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
+ NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+ DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+ return SDValue(NewScatter.getNode(), 0);
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
@@ -19869,17 +19948,59 @@ static SDValue LowerMGATHER(SDValue Op,
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
+ SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
+ SDValue Index = N->getIndex();
+ SDValue Mask = N->getMask();
+ SDValue Src0 = N->getValue();
+ MVT IndexVT = Index.getSimpleValueType();
+ MVT MaskVT = Mask.getSimpleValueType();
+
+ unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
- SDLoc dl(Op);
- SDValue Index = N->getIndex();
if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
- N->getOperand(3), Index };
- DAG.UpdateNodeOperands(N, Ops);
+ // AVX512F supports only 512-bit vectors. Either the data or the index
+ // should be 512 bits wide. If both the index and the data are 256-bit
+ // here, but the vector contains 8 elements, we just sign-extend the index.
+ if (NumElts == 8) {
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), Index };
+ DAG.UpdateNodeOperands(N, Ops);
+ return Op;
+ }
+
+ // Minimal number of elements in Gather
+ NumElts = 8;
+ // Index
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+ Index = ExtendToType(Index, NewIndexVT, DAG);
+ if (IndexVT.getScalarType() == MVT::i32)
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+ // Mask
+ MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
+ // At this point we have a promoted mask operand
+ assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+ MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+ Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+ Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
+
+ // The pass-thru value
+ MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+ Src0 = ExtendToType(Src0, NewVT, DAG);
+
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
+ N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ NewGather.getValue(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue RetOps[] = {Extract, NewGather.getValue(1)};
+ return DAG.getMergeValues(RetOps, dl);
}
return Op;
}
@@ -26907,6 +27028,20 @@ static SDValue PerformBLENDICombine(SDNo
return SDValue();
}
+static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ // Gather and Scatter instructions use k-registers for masks. The type of
+ // the masks is v*i1. So the mask will be truncated anyway.
+ // The SIGN_EXTEND_INREG may be dropped.
+ SDValue Mask = N->getOperand(2);
+ if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ NewOps[2] = Mask.getOperand(0);
+ DAG.UpdateNodeOperands(N, NewOps);
+ }
+ return SDValue();
+}
+
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
@@ -27348,6 +27483,8 @@ SDValue X86TargetLowering::PerformDAGCom
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
+ case ISD::MGATHER:
+ case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG);
}
return SDValue();
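A sketch of the AVX512F-without-VLX path in LowerMGATHER above (a hypothetical example): a sub-512-bit gather such as <4 x float> is padded to the minimal 8-element form, the index is sign-extended to v8i64 when needed, the mask is extended with zeroes and truncated to an i1 vector, and the low <4 x float> of the wide result is extracted with EXTRACT_SUBVECTOR.

; hypothetical example: widened to an 8-element 512-bit gather on AVX512F
; targets without VLX, then the low 4 elements are extracted
declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)

define <4 x float> @gather4(<4 x float*> %ptrs, <4 x i1> %mask) {
  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> %mask, <4 x float> undef)
  ret <4 x float> %res
}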
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=255629&r1=255628&r2=255629&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Tue Dec 15 02:40:41 2015
@@ -2176,17 +2176,19 @@ let Predicates = [HasAVX512] in {
(EXTRACT_SUBREG
(AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
sub_16bit)>;
- def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK16)>;
- def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK8)>;
-}
-let Predicates = [HasBWI] in {
- def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK32)>;
- def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK64)>;
}
+def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK16)>;
+def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK8)>;
+def : Pat<(v4i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK4)>;
+def : Pat<(v2i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK2)>;
+def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK32)>;
+def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK64)>;
// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
@@ -2489,6 +2491,9 @@ def : Pat<(v8i1 (extract_subvector (v16i
def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))),
(v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>;
+def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
+ (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
+
def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))),
(v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>;
@@ -2497,6 +2502,7 @@ def : Pat<(v32i1 (extract_subvector (v64
def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
(v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>;
+
def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
(v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>;
Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=255629&r1=255628&r2=255629&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Tue Dec 15 02:40:41 2015
@@ -1,29 +1,51 @@
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=KNL
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-; KNL-LABEL: test1
-; KNL: kxnorw %k1, %k1, %k1
-; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SCALAR-LABEL: test1
-; SCALAR: extractelement <16 x float*>
+; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float
define <16 x float> @test1(float* %base, <16 x i32> %ind) {
+; KNL_64-LABEL: test1:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test1:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test1:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
-
+
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res
}
@@ -31,23 +53,41 @@ define <16 x float> @test1(float* %base,
declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
-
-; KNL-LABEL: test2
-; KNL: kmovw %esi, %k1
-; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+
; SCALAR-LABEL: test2
-; SCALAR: extractelement <16 x float*>
+; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: br label %else
; SCALAR: else:
-; SCALAR-NEXT: %res.phi.else = phi
+; SCALAR-NEXT: %res.phi.else = phi
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2
define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
+; KNL_64-LABEL: test2:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test2:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test2:
+; SKX: # BB#0:
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -59,10 +99,28 @@ define <16 x float> @test2(float* %base,
ret <16 x float> %res
}
-; KNL-LABEL: test3
-; KNL: kmovw %esi, %k1
-; KNL: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
+; KNL_64-LABEL: test3:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test3:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test3:
+; SKX: # BB#0:
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
@@ -74,13 +132,38 @@ define <16 x i32> @test3(i32* %base, <16
ret <16 x i32> %res
}
-; KNL-LABEL: test4
-; KNL: kmovw %esi, %k1
-; KNL: kmovw
-; KNL: vpgatherdd
-; KNL: vpgatherdd
define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
+; KNL_64-LABEL: test4:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: kmovw %k1, %k2
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm2
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
+; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test4:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm2
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
+; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test4:
+; SKX: # BB#0:
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovw %k1, %k2
+; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
+; SKX-NEXT: vmovaps %zmm1, %zmm2
+; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
+; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
@@ -93,10 +176,6 @@ define <16 x i32> @test4(i32* %base, <16
ret <16 x i32> %res
}
-; KNL-LABEL: test5
-; KNL: kmovw %k1, %k2
-; KNL: vpscatterdd {{.*}}%k2
-; KNL: vpscatterdd {{.*}}%k1
; SCALAR-LABEL: test5
; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
@@ -113,6 +192,30 @@ define <16 x i32> @test4(i32* %base, <16
; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2
define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
+; KNL_64-LABEL: test5:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: kmovw %k1, %k2
+; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
+; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test5:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
+; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test5:
+; SKX: # BB#0:
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovw %k1, %k2
+; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
+; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
@@ -127,11 +230,6 @@ define void @test5(i32* %base, <16 x i32
declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
-; KNL-LABEL: test6
-; KNL: kxnorw %k1, %k1, %k1
-; KNL: kxnorw %k2, %k2, %k2
-; KNL: vpgatherqd (,%zmm{{.*}}), %ymm{{.*}} {%k2}
-; KNL: vpscatterqd %ymm{{.*}}, (,%zmm{{.*}}) {%k1}
; SCALAR-LABEL: test6
; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
@@ -143,6 +241,33 @@ declare void @llvm.masked.scatter.v16i32
; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
+; KNL_64-LABEL: test6:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: kxnorw %k2, %k2, %k2
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test6:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
+; KNL_32-NEXT: kxnorw %k2, %k2, %k2
+; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
+; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test6:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: kxnorw %k2, %k2, %k2
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
%a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
@@ -150,13 +275,41 @@ define <8 x i32> @test6(<8 x i32>%a1, <8
ret <8 x i32>%a
}
-; In this case the index should be promoted to <8 x i64> for KNL
-; KNL-LABEL: test7
-; KNL: vpmovsxdq %ymm0, %zmm0
-; KNL: kmovw %k1, %k2
-; KNL: vpgatherqd {{.*}} {%k2}
-; KNL: vpgatherqd {{.*}} {%k1}
define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
+;
+; KNL_64-LABEL: test7:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: movzbl %sil, %eax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: kmovw %k1, %k2
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm2
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test7:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm2
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test7:
+; SKX: # BB#0:
+; SKX-NEXT: kmovb %esi, %k1
+; SKX-NEXT: kmovw %k1, %k2
+; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
+; SKX-NEXT: vmovaps %zmm1, %zmm2
+; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
+; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
@@ -171,15 +324,57 @@ define <8 x i32> @test7(i32* %base, <8 x
; No uniform base in this case, index <8 x i64> contains addresses,
; each gather call will be split into two
-; KNL-LABEL: test8
-; KNL: kshiftrw $8, %k1, %k2
-; KNL: vpgatherqd
-; KNL: vpgatherqd
-; KNL: vinserti64x4
-; KNL: vpgatherqd
-; KNL: vpgatherqd
-; KNL: vinserti64x4
define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
+; KNL_64-LABEL: test8:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kmovw %edi, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: kmovw %k2, %k3
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
+; KNL_64-NEXT: kmovw %k1, %k3
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test8:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm2
+; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
+; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test8:
+; SKX: # BB#0:
+; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: kmovw %k2, %k3
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
+; SKX-NEXT: kmovw %k1, %k3
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
+; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test8:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: kmovw %k1, %k2
+; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm2
+; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
+; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; SKX_32-NEXT: retl
+
%imask = bitcast i16 %mask to <16 x i1>
%gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
%gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
@@ -193,18 +388,60 @@ define <16 x i32> @test8(<16 x i32*> %pt
; Masked gather for aggregate types
; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
-; KNL-LABEL: test9
-; KNL: vpbroadcastq %rdi, %zmm
-; KNL: vpmovsxdq
-; KNL: vpbroadcastq
-; KNL: vpmuludq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpgatherqd (,%zmm
define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
+; KNL_64-LABEL: test9:
+; KNL_64: # BB#0: # %entry
+; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
+; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
+; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
+; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
+; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
+; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
+; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test9:
+; KNL_32: # BB#0: # %entry
+; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
+; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3
+; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
+; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
+; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3
+; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1
+; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test9:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vpbroadcastq %rdi, %zmm2
+; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX-NEXT: retq
entry:
%broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
%broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
@@ -214,17 +451,59 @@ entry:
ret <8 x i32> %res
}
-; KNL-LABEL: test10
-; KNL: vpbroadcastq %rdi, %zmm
-; KNL: vpmovsxdq
-; KNL: vpbroadcastq
-; KNL: vpmuludq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpgatherqd (,%zmm
define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
+; KNL_64-LABEL: test10:
+; KNL_64: # BB#0: # %entry
+; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
+; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
+; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
+; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
+; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
+; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
+; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test10:
+; KNL_32: # BB#0: # %entry
+; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
+; KNL_32-NEXT: vpbroadcastd .LCPI9_0, %ymm3
+; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
+; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
+; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3
+; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1
+; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test10:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vpbroadcastq %rdi, %zmm2
+; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX-NEXT: retq
entry:
%broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
%broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
@@ -235,10 +514,28 @@ entry:
}
; Splat index in GEP, requires broadcast
-; KNL-LABEL: test11
-; KNL: vpbroadcastd %esi, %zmm
-; KNL: vgatherdps (%rdi,%zmm
define <16 x float> @test11(float* %base, i32 %ind) {
+; KNL_64-LABEL: test11:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test11:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test11:
+; SKX: # BB#0:
+; SKX-NEXT: vpbroadcastd %esi, %zmm1
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -250,10 +547,28 @@ define <16 x float> @test11(float* %base
}
; We are checking the uniform base here; it is taken directly from the input to vgatherdps
-; KNL-LABEL: test12
-; KNL: kxnorw %k1, %k1, %k1
-; KNL: vgatherdps (%rdi,%zmm
define <16 x float> @test12(float* %base, <16 x i32> %ind) {
+; KNL_64-LABEL: test12:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test12:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test12:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
@@ -263,10 +578,25 @@ define <16 x float> @test12(float* %base
}
; The same as the previous test, but the mask is undefined
-; KNL-LABEL: test13
-; KNL-NOT: kxnorw
-; KNL: vgatherdps (%rdi,%zmm
define <16 x float> @test13(float* %base, <16 x i32> %ind) {
+; KNL_64-LABEL: test13:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test13:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test13:
+; SKX: # BB#0:
+; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
@@ -276,10 +606,58 @@ define <16 x float> @test13(float* %base
}
; The base pointer is not a splat, so no uniform base can be found
-; KNL-LABEL: test14
-; KNL: vgatherqps (,%zmm0)
-; KNL: vgatherqps (,%zmm0)
define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
+; KNL_64-LABEL: test14:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
+; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
+; KNL_64-NEXT: vmovd %esi, %xmm1
+; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
+; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: kshiftrw $8, %k0, %k1
+; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test14:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
+; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
+; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
+; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
+; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test14:
+; SKX: # BB#0:
+; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
+; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
+; SKX-NEXT: vmovd %esi, %xmm1
+; SKX-NEXT: vpbroadcastd %xmm1, %ymm1
+; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
+; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: kshiftrw $8, %k0, %k1
+; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
+; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test14:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
+; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
+; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
+; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
+; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
+; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -290,19 +668,585 @@ define <16 x float> @test14(float* %base
ret <16 x float>%res
}
+declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
+
+; Gather smaller than existing instruction
+define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
+;
+; KNL_64-LABEL: test15:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test15:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0
+; KNL_32-NEXT: vpandq .LCPI14_0, %zmm0, %zmm0
+; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test15:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
+ %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
+ ret <4 x float>%res
+}
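The KNL output above is the widened form: with no 128-bit gather available under AVX512F, the legalizer pads the <4 x i32> index and <4 x i1> mask out to eight lanes, forcing the extra mask lanes to false so the widened gather never touches the padding addresses (hence the vpblendd against zero and the vpandq/vptestmq pair). An IR-level sketch of the equivalent transformation, assuming a v8f32 gather declaration:

define <4 x float> @widen_gather_sketch(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
  ; pad index and mask to 8 lanes; the index padding is don't-care, but the
  ; mask padding must be false
  %ind.wide = shufflevector <4 x i32> %ind, <4 x i32> undef,
              <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %mask.wide = shufflevector <4 x i1> %mask, <4 x i1> zeroinitializer,
              <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ind64 = sext <8 x i32> %ind.wide to <8 x i64>
  %ptrs = getelementptr float, float* %base, <8 x i64> %ind64
  %wide = call <8 x float> @llvm.masked.gather.v8f32(<8 x float*> %ptrs, i32 4,
              <8 x i1> %mask.wide, <8 x float> undef)
  ; keep only the four requested lanes
  %res = shufflevector <8 x float> %wide, <8 x float> undef,
              <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}
declare <8 x float> @llvm.masked.gather.v8f32(<8 x float*>, i32, <8 x i1>, <8 x float>)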
+
+; Gather smaller than existing instruction
+define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
+;
+; KNL_64-LABEL: test16:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test16:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: vpandq .LCPI15_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test16:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
+
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
+ %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
+ ret <4 x double>%res
+}
+
+define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
+;
+; KNL_64-LABEL: test17:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test17:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpandq .LCPI16_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test17:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
+
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
+ %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double>%res
+}
+
+declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
+declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
+declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
+
+define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
+;
+; KNL_64-LABEL: test18:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test18:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
+; KNL_32-NEXT: vpandq .LCPI17_0, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test18:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovd2m %xmm2, %k1
+; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
+define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
+;
+; KNL_64-LABEL: test19:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test19:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpandq .LCPI18_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test19:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test19:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
+; SKX_32-NEXT: retl
+ %gep = getelementptr double, double* %ptr, <4 x i64> %ind
+ call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
+ ret void
+}
-; KNL-LABEL: test15
-; KNL: kmovw %eax, %k1
-; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; Data type requires widening
+define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
+;
+; KNL_64-LABEL: test20:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_64-NEXT: vmovq %xmm2, %xmm2
+; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test20:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_32-NEXT: vmovq %xmm2, %xmm2
+; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
+; KNL_32-NEXT: vpandq .LCPI19_0, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test20:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovq2m %xmm2, %k0
+; SKX-NEXT: kshiftlw $2, %k0, %k0
+; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
+ ret void
+}
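test20 is the case named in the commit message: a <2 x float> scatter needs its data, pointers, and mask widened to four lanes before a legal scatter exists. The padding lanes may carry undef data and pointers because the corresponding mask lanes are forced to false; on SKX that is what the kshiftlw/kshiftrw pair does to the k-register. A hedged IR sketch of the widening, with the v4f32 scatter declaration assumed:

define void @widen_scatter_sketch(<2 x float> %a1, <2 x float*> %ptr, <2 x i1> %mask) {
  ; pad data and pointers with undef lanes...
  %data.wide = shufflevector <2 x float> %a1, <2 x float> undef,
               <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %ptr.wide = shufflevector <2 x float*> %ptr, <2 x float*> undef,
               <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; ...but pad the mask with false, so the undef lanes are never stored
  %mask.wide = shufflevector <2 x i1> %mask, <2 x i1> zeroinitializer,
               <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.masked.scatter.v4f32(<4 x float> %data.wide, <4 x float*> %ptr.wide,
               i32 4, <4 x i1> %mask.wide)
  ret void
}
declare void @llvm.masked.scatter.v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)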
+
+; Data type requires promotion
+define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
+;
+; KNL_64-LABEL: test21:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test21:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: vpandq .LCPI20_0, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test21:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovq2m %xmm2, %k0
+; SKX-NEXT: kshiftlw $2, %k0, %k0
+; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
+ ret void
+}
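The vpshufd {0,2,2,3} in every variant of test21 is the promotion being undone: on x86-64 a <2 x i32> is promoted to two 64-bit lanes, so the two payload dwords sit in lanes 0 and 2 of the xmm register and must be packed back together before the 32-bit scatter. A little-endian sketch of that lane layout (zext here stands in for the any-extend the promotion really performs):

define <2 x i32> @pack_promoted_sketch(<2 x i32> %a1) {
  ; <2 x i32> lives in the register as <2 x i64>; viewed as four dwords the
  ; payload occupies lanes 0 and 2, which is what vpshufd {0,2,2,3} extracts
  %promoted = zext <2 x i32> %a1 to <2 x i64>
  %dwords = bitcast <2 x i64> %promoted to <4 x i32>
  %packed = shufflevector <4 x i32> %dwords, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x i32> %packed
}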
+
+; The result type requires widening
+declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
+
+define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
+;
+;
+; KNL_64-LABEL: test22:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_64-NEXT: vmovq %xmm1, %xmm1
+; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test22:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_32-NEXT: vmovq %xmm1, %xmm1
+; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_32-NEXT: vpandq .LCPI21_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test22:
+; SKX: # BB#0:
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT: vpmovq2m %xmm1, %k0
+; SKX-NEXT: kshiftlw $2, %k0, %k0
+; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
+ %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
+ ret <2 x float>%res
+}
+
+declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
+
+define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
+;
+; KNL_64-LABEL: test23:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test23:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpandq .LCPI22_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test23:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
+ %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
+ ret <2 x i32>%res
+}
+
+define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
+;
+;
+; KNL_64-LABEL: test24:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: movb $3, %al
+; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test24:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
+; KNL_32-NEXT: vpandq .LCPI23_1, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test24:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
+ %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+ ret <2 x i32>%res
+}
-; SCALAR-LABEL: test15
-; SCALAR: extractelement <16 x float*>
+define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
+;
+; KNL_64-LABEL: test25:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test25:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpandq .LCPI24_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test25:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
+ %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
+ ret <2 x i64>%res
+}
+
+define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
+;
+; KNL_64-LABEL: test26:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: movb $3, %al
+; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test26:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
+; KNL_32-NEXT: vpandq .LCPI25_1, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test26:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
+ %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
+ ret <2 x i64>%res
+}
+
+; Result type requires widening; all-ones mask
+define <2 x float> @test27(float* %base, <2 x i32> %ind) {
+;
+; KNL_64-LABEL: test27:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
+; KNL_64-NEXT: movb $3, %al
+; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test27:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
+; KNL_32-NEXT: movb $3, %cl
+; KNL_32-NEXT: movzbl %cl, %ecx
+; KNL_32-NEXT: kmovw %ecx, %k1
+; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test27:
+; SKX: # BB#0:
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SKX-NEXT: movb $3, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
+ %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
+ ret <2 x float>%res
+}
+
+; Data type requires promotion, mask is all-ones
+define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
+;
+;
+; KNL_64-LABEL: test28:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: movb $3, %al
+; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test28:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
+; KNL_32-NEXT: vpandq .LCPI27_1, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test28:
+; SKX: # BB#0:
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT: movb $3, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
+ ret void
+}
+
+
+; SCALAR-LABEL: test29
+; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float
-define <16 x float> @test15(float* %base, <16 x i32> %ind) {
+define <16 x float> @test29(float* %base, <16 x i32> %ind) {
+; KNL_64-LABEL: test29:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: movw $44, %ax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test29:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movw $44, %cx
+; KNL_32-NEXT: kmovw %ecx, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test29:
+; SKX: # BB#0:
+; SKX-NEXT: movw $44, %ax
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -316,14 +1260,136 @@ define <16 x float> @test15(float* %base
; Check the non-power-of-2 case. It should be scalarized.
declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
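Since <3 x i1> is not a power-of-2 type, no masked instruction applies and the gather is scalarized into a chain of test-and-branch blocks, the cond.load/else labels visible in the checks below. Roughly, the scalarized CFG looks like this (a sketch of the pattern, not the exact IR the compiler builds internally):

define <3 x i32> @scalarized_gather_sketch(<3 x i32*> %ptrs, <3 x i1> %mask, <3 x i32> %src0) {
entry:
  ; lane 0: test the mask bit, conditionally load, merge with pass-through
  %m0 = extractelement <3 x i1> %mask, i32 0
  br i1 %m0, label %cond.load, label %else
cond.load:
  %p0 = extractelement <3 x i32*> %ptrs, i32 0
  %v0 = load i32, i32* %p0, align 4
  %ins0 = insertelement <3 x i32> %src0, i32 %v0, i32 0
  br label %else
else:
  %res0 = phi <3 x i32> [ %ins0, %cond.load ], [ %src0, %entry ]
  %m1 = extractelement <3 x i1> %mask, i32 1
  br i1 %m1, label %cond.load1, label %else2
cond.load1:
  %p1 = extractelement <3 x i32*> %ptrs, i32 1
  %v1 = load i32, i32* %p1, align 4
  %ins1 = insertelement <3 x i32> %res0, i32 %v1, i32 1
  br label %else2
else2:
  %res1 = phi <3 x i32> [ %ins1, %cond.load1 ], [ %res0, %else ]
  %m2 = extractelement <3 x i1> %mask, i32 2
  br i1 %m2, label %cond.load4, label %else5
cond.load4:
  %p2 = extractelement <3 x i32*> %ptrs, i32 2
  %v2 = load i32, i32* %p2, align 4
  %ins2 = insertelement <3 x i32> %res1, i32 %v2, i32 2
  br label %else5
else5:
  %res2 = phi <3 x i32> [ %ins2, %cond.load4 ], [ %res1, %else2 ]
  ret <3 x i32> %res2
}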
-; KNL-LABEL: test16
-; KNL: testb
-; KNL: je
-; KNL: testb
-; KNL: je
-; KNL: testb
-; KNL: je
-define <3 x i32> @test16(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
+define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
+; KNL_64-LABEL: test30:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: andl $1, %edx
+; KNL_64-NEXT: kmovw %edx, %k1
+; KNL_64-NEXT: andl $1, %esi
+; KNL_64-NEXT: kmovw %esi, %k2
+; KNL_64-NEXT: movl %edi, %eax
+; KNL_64-NEXT: andl $1, %eax
+; KNL_64-NEXT: kmovw %eax, %k0
+; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
+; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; KNL_64-NEXT: # implicit-def: %XMM0
+; KNL_64-NEXT: testb $1, %dil
+; KNL_64-NEXT: je .LBB29_2
+; KNL_64-NEXT: # BB#1: # %cond.load
+; KNL_64-NEXT: vmovq %xmm1, %rax
+; KNL_64-NEXT: vmovd (%rax), %xmm0
+; KNL_64-NEXT: .LBB29_2: # %else
+; KNL_64-NEXT: kmovw %k2, %eax
+; KNL_64-NEXT: movl %eax, %ecx
+; KNL_64-NEXT: andl $1, %ecx
+; KNL_64-NEXT: testb %cl, %cl
+; KNL_64-NEXT: je .LBB29_4
+; KNL_64-NEXT: # BB#3: # %cond.load1
+; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
+; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
+; KNL_64-NEXT: .LBB29_4: # %else2
+; KNL_64-NEXT: kmovw %k1, %ecx
+; KNL_64-NEXT: movl %ecx, %edx
+; KNL_64-NEXT: andl $1, %edx
+; KNL_64-NEXT: testb %dl, %dl
+; KNL_64-NEXT: je .LBB29_6
+; KNL_64-NEXT: # BB#5: # %cond.load4
+; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL_64-NEXT: vmovq %xmm1, %rdx
+; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0
+; KNL_64-NEXT: .LBB29_6: # %else5
+; KNL_64-NEXT: kmovw %k0, %edx
+; KNL_64-NEXT: vmovd %edx, %xmm1
+; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test30:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: andl $1, %eax
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: andl $1, %eax
+; KNL_32-NEXT: kmovw %eax, %k2
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl %eax, %ecx
+; KNL_32-NEXT: andl $1, %ecx
+; KNL_32-NEXT: kmovw %ecx, %k0
+; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
+; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; KNL_32-NEXT: # implicit-def: %XMM0
+; KNL_32-NEXT: testb $1, %al
+; KNL_32-NEXT: je .LBB29_2
+; KNL_32-NEXT: # BB#1: # %cond.load
+; KNL_32-NEXT: vmovd %xmm1, %eax
+; KNL_32-NEXT: vmovd (%eax), %xmm0
+; KNL_32-NEXT: .LBB29_2: # %else
+; KNL_32-NEXT: kmovw %k2, %eax
+; KNL_32-NEXT: movl %eax, %ecx
+; KNL_32-NEXT: andl $1, %ecx
+; KNL_32-NEXT: testb %cl, %cl
+; KNL_32-NEXT: je .LBB29_4
+; KNL_32-NEXT: # BB#3: # %cond.load1
+; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
+; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: .LBB29_4: # %else2
+; KNL_32-NEXT: kmovw %k1, %ecx
+; KNL_32-NEXT: movl %ecx, %edx
+; KNL_32-NEXT: andl $1, %edx
+; KNL_32-NEXT: testb %dl, %dl
+; KNL_32-NEXT: je .LBB29_6
+; KNL_32-NEXT: # BB#5: # %cond.load4
+; KNL_32-NEXT: vpextrd $2, %xmm1, %edx
+; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0
+; KNL_32-NEXT: .LBB29_6: # %else5
+; KNL_32-NEXT: kmovw %k0, %edx
+; KNL_32-NEXT: vmovd %edx, %xmm1
+; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test30:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovd2m %xmm2, %k1
+; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
+; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SKX-NEXT: # implicit-def: %XMM0
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: je .LBB29_2
+; SKX-NEXT: # BB#1: # %cond.load
+; SKX-NEXT: vmovq %xmm1, %rax
+; SKX-NEXT: vmovd (%rax), %xmm0
+; SKX-NEXT: .LBB29_2: # %else
+; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: je .LBB29_4
+; SKX-NEXT: # BB#3: # %cond.load1
+; SKX-NEXT: vpextrq $1, %xmm1, %rax
+; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
+; SKX-NEXT: .LBB29_4: # %else2
+; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: je .LBB29_6
+; SKX-NEXT: # BB#5: # %cond.load4
+; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
+; SKX-NEXT: vmovq %xmm1, %rax
+; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
+; SKX-NEXT: .LBB29_6: # %else5
+; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
+; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: retq
+
%sext_ind = sext <3 x i32> %ind to <3 x i64>
%gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
%res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
@@ -332,11 +1398,405 @@ define <3 x i32> @test16(<3 x i32*> %bas
declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
-; KNL-LABEL: test17
+; KNL-LABEL: test31
; KNL: vpgatherqq
; KNL: vpgatherqq
-define <16 x float*> @test17(<16 x float**> %ptrs) {
+define <16 x float*> @test31(<16 x float**> %ptrs) {
+; KNL_64-LABEL: test31:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: kxnorw %k2, %k2, %k2
+; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
+; KNL_64-NEXT: kshiftrw $8, %k1, %k1
+; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: vmovaps %zmm3, %zmm1
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test31:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test31:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: kxnorw %k2, %k2, %k2
+; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %zmm3, %zmm1
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test31:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: kxnorw %k1, %k1, %k1
+; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: retl
%res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
ret <16 x float*>%res
}
+
+define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
+; KNL_64-LABEL: test_gather_16i32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16i32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_16i32:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: retl
+ %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
+ ret <16 x i32> %res
+}
+define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
+; KNL_64-LABEL: test_gather_16i64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
+; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
+; KNL_64-NEXT: vmovaps %zmm3, %zmm0
+; KNL_64-NEXT: vmovaps %zmm4, %zmm1
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16i64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp0:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp1:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp2:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI32_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
+; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
+; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: vmovaps %zmm4, %zmm1
+; SKX-NEXT: retq
+ %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
+ ret <16 x i64> %res
+}
+declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
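A <16 x i64> result needs two zmm registers, so the legalizer splits the gather in half: the mask is divided with kshiftrw $8 and two <8 x i64> gathers are issued, as in the KNL_64 and SKX checks above. An equivalent IR-level split, assuming a v8i64 gather declaration:

define <16 x i64> @split_gather_sketch(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
  ; halve the pointers, the mask, and the pass-through value
  %p.lo = shufflevector <16 x i64*> %ptrs, <16 x i64*> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %p.hi = shufflevector <16 x i64*> %ptrs, <16 x i64*> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %m.lo = shufflevector <16 x i1> %mask, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %m.hi = shufflevector <16 x i1> %mask, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s.lo = shufflevector <16 x i64> %src0, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s.hi = shufflevector <16 x i64> %src0, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; two half-width gathers, then concatenate the results
  %r.lo = call <8 x i64> @llvm.masked.gather.v8i64(<8 x i64*> %p.lo, i32 4, <8 x i1> %m.lo, <8 x i64> %s.lo)
  %r.hi = call <8 x i64> @llvm.masked.gather.v8i64(<8 x i64*> %p.hi, i32 4, <8 x i1> %m.hi, <8 x i64> %s.hi)
  %res = shufflevector <8 x i64> %r.lo, <8 x i64> %r.hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i64> %res
}
declare <8 x i64> @llvm.masked.gather.v8i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>)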
+define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_gather_16f32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
+; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16f32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI33_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16f32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
+; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: retq
+ %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+ ret <16 x float> %res
+}
+define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
+; KNL_64-LABEL: test_gather_16f64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
+; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
+; KNL_64-NEXT: vmovaps %zmm3, %zmm0
+; KNL_64-NEXT: vmovaps %zmm4, %zmm1
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16f64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp3:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp4:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp5:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI34_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16f64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
+; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
+; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: vmovaps %zmm4, %zmm1
+; SKX-NEXT: retq
+ %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
+ ret <16 x double> %res
+}
+declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
+define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
+; KNL_64-LABEL: test_scatter_16i32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16i32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
+; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
+; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_scatter_16i32:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; SKX_32-NEXT: retl
+ call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
+; KNL_64-LABEL: test_scatter_16i64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16i64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp6:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp7:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp8:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI36_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
+; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
+define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_scatter_16f32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
+; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16f32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI37_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16f32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
+; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
+; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
+define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
+; KNL_64-LABEL: test_scatter_16f64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16f64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp9:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp10:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp11:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI38_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16f64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
+; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
Modified: llvm/trunk/test/CodeGen/X86/masked_memop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_memop.ll?rev=255629&r1=255628&r2=255629&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_memop.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_memop.ll Tue Dec 15 02:40:41 2015
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
-; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s -check-prefix=SKX
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s --check-prefix=AVX512
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX2
+; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s --check-prefix=AVX_SCALAR
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s --check-prefix=SKX
; AVX512-LABEL: test1
; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
@@ -274,6 +274,15 @@ define <2 x i32> @test17(<2 x i32> %trig
; AVX2-NOT: blend
; AVX2: ret
define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
+; SKX-LABEL: test18:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; SKX-NEXT: kshiftlw $2, %k0, %k0
+; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
ret <2 x float> %res
@@ -363,3 +372,77 @@ define <16 x %mystruct*> @test24(<16 x i
%res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
ret <16 x %mystruct*> %res
}
+
+define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
+; SKX-LABEL: test_store_16i64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
+define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
+; SKX-LABEL: test_store_16f64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovupd %zmm1, (%rdi) {%k1}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
+define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
+; SKX-LABEL: test_load_16i64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovaps %zmm2, %zmm1
+; SKX-NEXT: retq
+ %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
+ ret <16 x i64> %res
+}
+declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
+define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
+; SKX-LABEL: test_load_16f64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovaps %zmm2, %zmm1
+; SKX-NEXT: retq
+ %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
+ ret <16 x double> %res
+}
+declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
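The same halving applies to masked loads and stores once the vector no longer fits in one zmm: the <16 x double> load above becomes two <8 x double> loads at byte offsets 0 and 64, with kshiftrw $8 peeling off the upper mask half, and the <32 x double> test below simply repeats the split once more. An IR sketch of the two-way split, assuming a v8f64 masked-load declaration:

define <16 x double> @split_load_sketch(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
  ; split the mask and the pass-through value in half
  %m.lo = shufflevector <16 x i1> %mask, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %m.hi = shufflevector <16 x i1> %mask, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s.lo = shufflevector <16 x double> %src0, <16 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s.hi = shufflevector <16 x double> %src0, <16 x double> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; the upper half starts 8 doubles (64 bytes) into the buffer
  %base = bitcast <16 x double>* %ptrs to double*
  %base.hi = getelementptr double, double* %base, i64 8
  %p.lo = bitcast double* %base to <8 x double>*
  %p.hi = bitcast double* %base.hi to <8 x double>*
  %r.lo = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %p.lo, i32 4, <8 x i1> %m.lo, <8 x double> %s.lo)
  %r.hi = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %p.hi, i32 4, <8 x i1> %m.hi, <8 x double> %s.hi)
  %res = shufflevector <8 x double> %r.lo, <8 x double> %r.hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x double> %res
}
declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)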
+
+define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
+; SKX-LABEL: test_load_32f64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; SKX-NEXT: kshiftrd $16, %k1, %k2
+; SKX-NEXT: vmovupd 128(%rdi), %zmm3 {%k2}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; SKX-NEXT: kshiftrw $8, %k2, %k1
+; SKX-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovaps %zmm2, %zmm1
+; SKX-NEXT: vmovaps %zmm3, %zmm2
+; SKX-NEXT: vmovaps %zmm4, %zmm3
+; SKX-NEXT: retq
+ %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
+ ret <32 x double> %res
+}
+declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)