[llvm-branch-commits] [llvm] cdb692e - [X86] Add X86ISD::SUBV_BROADCAST_LOAD and begin removing X86ISD::SUBV_BROADCAST (PR38969)
Simon Pilgrim via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Dec 17 02:29:59 PST 2020
Author: Simon Pilgrim
Date: 2020-12-17T10:25:25Z
New Revision: cdb692ee0c6745ea008ee6cc00fe1e65021516bb
URL: https://github.com/llvm/llvm-project/commit/cdb692ee0c6745ea008ee6cc00fe1e65021516bb
DIFF: https://github.com/llvm/llvm-project/commit/cdb692ee0c6745ea008ee6cc00fe1e65021516bb.diff
LOG: [X86] Add X86ISD::SUBV_BROADCAST_LOAD and begin removing X86ISD::SUBV_BROADCAST (PR38969)
Subvector broadcasts are only ever load instructions, yet X86ISD::SUBV_BROADCAST models them more generally, which requires a lot of fallback tablegen patterns.
This initial patch replaces the constant-vector lowering inside lowerBuildVectorAsBroadcast with direct X86ISD::SUBV_BROADCAST_LOAD loads, which helps us merge a number of equivalent loads/broadcasts.
In addition to the general plumbing/analysis additions for SUBV_BROADCAST_LOAD, I needed to wrap SelectionDAG::makeEquivalentMemoryOrdering so it can handle result chains from nodes that are not generic LoadSDNodes.
Later patches will continue to replace X86ISD::SUBV_BROADCAST usage.
Differential Revision: https://reviews.llvm.org/D92645
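For illustration, a minimal sketch (not the committed code) of the new lowering pattern: the constant subvector is emitted as an X86ISD::SUBV_BROADCAST_LOAD memory-intrinsic node, and the chain-based overload of makeEquivalentMemoryOrdering splices its output chain in place of the node it replaces. Names such as VT, MemVT, Ptr, Alignment, dl and OldMemNode stand in for values available at the call site.

  SDVTList Tys = DAG.getVTList(VT, MVT::Other);   // value result + chain result
  SDValue Ops[] = {DAG.getEntryNode(), Ptr};      // chain, base pointer
  SDValue Bcst = DAG.getMemIntrinsicNode(
      X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, MemVT,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment,
      MachineMemOperand::MOLoad);
  // SUBV_BROADCAST_LOAD is not a generic LoadSDNode, so the new SDValue-based
  // overload keeps the old node's relative memory-dependency position:
  DAG.makeEquivalentMemoryOrdering(SDValue(OldMemNode, 1), Bcst.getValue(1));

This mirrors the lowerBuildVectorAsBroadcast and SimplifyDemandedVectorEltsForTargetNode changes in the diff below.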
Added:
Modified:
llvm/include/llvm/CodeGen/SelectionDAG.h
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.h
llvm/lib/Target/X86/X86InstrAVX512.td
llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
llvm/lib/Target/X86/X86InstrSSE.td
llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
llvm/test/CodeGen/X86/subvector-broadcast.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 4ec870bb3f9b..fbaa1f0e974f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1591,7 +1591,14 @@ class SelectionDAG {
/// chain to the token factor. This ensures that the new memory node will have
/// the same relative memory dependency position as the old load. Returns the
/// new merged load chain.
- SDValue makeEquivalentMemoryOrdering(LoadSDNode *Old, SDValue New);
+ SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain);
+
+ /// If an existing load has uses of its chain, create a token factor node with
+ /// that chain and the new memory node's chain and update users of the old
+ /// chain to the token factor. This ensures that the new memory node will have
+ /// the same relative memory dependency position as the old load. Returns the
+ /// new merged load chain.
+ SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp);
/// Topological-sort the AllNodes list and a
/// assign a unique node id for each node in the DAG based on their
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 83e8637b3840..b2c748167577 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8966,25 +8966,32 @@ void SelectionDAG::AddDbgLabel(SDDbgLabel *DB) {
DbgInfo->add(DB);
}
-SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
- SDValue NewMemOp) {
- assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
+ SDValue NewMemOpChain) {
+ assert(isa<MemSDNode>(NewMemOpChain) && "Expected a memop node");
+ assert(NewMemOpChain.getValueType() == MVT::Other && "Expected a token VT");
// The new memory operation must have the same position as the old load in
// terms of memory dependency. Create a TokenFactor for the old load and new
// memory operation and update uses of the old load's output chain to use that
// TokenFactor.
- SDValue OldChain = SDValue(OldLoad, 1);
- SDValue NewChain = SDValue(NewMemOp.getNode(), 1);
- if (OldChain == NewChain || !OldLoad->hasAnyUseOfValue(1))
- return NewChain;
+ if (OldChain == NewMemOpChain || OldChain.use_empty())
+ return NewMemOpChain;
- SDValue TokenFactor =
- getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other, OldChain, NewChain);
+ SDValue TokenFactor = getNode(ISD::TokenFactor, SDLoc(OldChain), MVT::Other,
+ OldChain, NewMemOpChain);
ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
- UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain);
+ UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewMemOpChain);
return TokenFactor;
}
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+ SDValue NewMemOp) {
+ assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
+ SDValue OldChain = SDValue(OldLoad, 1);
+ SDValue NewMemOpChain = NewMemOp.getValue(1);
+ return makeEquivalentMemoryOrdering(OldChain, NewMemOpChain);
+}
+
SDValue SelectionDAG::getSymbolFunctionGlobalAddress(SDValue Op,
Function **OutFunction) {
assert(isa<ExternalSymbolSDNode>(Op) && "Node should be an ExternalSymbol");
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a1c692daf593..5264014f2b8f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6644,15 +6644,30 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Extract constant bits from a subvector broadcast.
- if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
- SmallVector<APInt, 16> SubEltBits;
- if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
- UndefElts, SubEltBits, AllowWholeUndefs,
- AllowPartialUndefs)) {
- UndefElts = APInt::getSplat(NumElts, UndefElts);
- while (EltBits.size() < NumElts)
- EltBits.append(SubEltBits.begin(), SubEltBits.end());
- return true;
+ if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ SDValue Ptr = MemIntr->getBasePtr();
+ if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
+ Type *CstTy = Cst->getType();
+ unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+ if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0)
+ return false;
+ unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits;
+ unsigned NumSubVecs = SizeInBits / CstSizeInBits;
+ APInt UndefSubElts(NumSubElts, 0);
+ SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
+ APInt(SubEltSizeInBits, 0));
+ for (unsigned i = 0; i != NumSubElts; ++i) {
+ if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
+ UndefSubElts, i))
+ return false;
+ for (unsigned j = 1; j != NumSubVecs; ++j)
+ SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
+ }
+ UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
+ UndefSubElts);
+ return CastBitData(UndefSubElts, SubEltBits);
}
}
@@ -8802,17 +8817,19 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
}
if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
- MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+ MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
- Ld = DAG.getLoad(
- MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), VCP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ return DAG.getMemIntrinsicNode(
+ X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
+ MachineMemOperand::MOLoad);
}
}
}
@@ -30929,6 +30946,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VBROADCAST_LOAD)
NODE_NAME_CASE(VBROADCASTM)
NODE_NAME_CASE(SUBV_BROADCAST)
+ NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
NODE_NAME_CASE(VPERMILPV)
NODE_NAME_CASE(VPERMILPI)
NODE_NAME_CASE(VPERM2X128)
@@ -38056,6 +38074,34 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
TLO.DAG, DL, ExtSizeInBits));
+ }
+ case X86ISD::SUBV_BROADCAST_LOAD: {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ EVT MemVT = MemIntr->getMemoryVT();
+ if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
+ SDLoc DL(Op);
+ SDValue Ld =
+ TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
+ MemIntr->getBasePtr(), MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Ld.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
+ SDLoc DL(Op);
+ EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+ ExtSizeInBits / VT.getScalarSizeInBits());
+ SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
+ SDValue Bcst =
+ TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
+ Ops, MemVT, MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Bcst.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
+ break;
}
// Byte shifts by immediate.
case X86ISD::VSHLDQ:
@@ -44606,6 +44652,29 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
}
}
+ // If we also broadcast this as a subvector to a wider type, then just extract
+ // the lowest subvector.
+ if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
+ (RegVT.is128BitVector() || RegVT.is256BitVector())) {
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Chain = Ld->getChain();
+ for (SDNode *User : Ptr->uses()) {
+ if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
+ cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
+ cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
+ MemVT.getSizeInBits() &&
+ !User->hasAnyUseOfValue(1) &&
+ User->getValueSizeInBits(0).getFixedSize() >
+ RegVT.getFixedSizeInBits()) {
+ SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
+ RegVT.getSizeInBits());
+ Extract = DAG.getBitcast(RegVT, Extract);
+ return DCI.CombineTo(N, Extract, SDValue(User, 1));
+ }
+ }
+ }
+
// Cast ptr32 and ptr64 pointers to the default address space before a load.
unsigned AddrSpace = Ld->getAddressSpace();
if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
@@ -49321,7 +49390,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
// extract the lowest subvector instead which should allow
// SimplifyDemandedVectorElts do more simplifications.
if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
- InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
+ InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD))
return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
// If we're extracting a broadcasted subvector, just use the source.
@@ -49687,11 +49757,15 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
}
-// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
-// cases where the loads have the same input chain and the output chains are
-// unused. This avoids any memory ordering issues.
-static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
+// from. Limit this to cases where the loads have the same input chain and the
+// output chains are unused. This avoids any memory ordering issues.
+static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
+ "Unknown broadcast load type");
+
// Only do this if the chain result is unused.
if (N->hasAnyUseOfValue(1))
return SDValue();
@@ -49706,7 +49780,7 @@ static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
// Look at other users of our base pointer and try to find a wider broadcast.
// The input chain and the size of the memory VT must match.
for (SDNode *User : Ptr->uses())
- if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ if (User != N && User->getOpcode() == N->getOpcode() &&
cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
@@ -49963,7 +50037,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
- case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
+ case X86ISD::VBROADCAST_LOAD:
+ case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 4ed78e1e97a4..e3f9ce8d698b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -776,9 +776,12 @@ namespace llvm {
// extract_vector_elt, store.
VEXTRACT_STORE,
- // scalar broadcast from memory
+ // scalar broadcast from memory.
VBROADCAST_LOAD,
+ // subvector broadcast from memory.
+ SUBV_BROADCAST_LOAD,
+
// Store FP control world into i16 memory.
FNSTCW16m,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index afe2176548fa..c6367a07137e 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1456,6 +1456,32 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
EVEX_V512, EVEX_CD8<64, CD8VT4>;
let Predicates = [HasAVX512] in {
+def : Pat<(v8f64 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v32i16 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTI64X4rm addr:$src)>;
+
+def : Pat<(v8f64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v32i16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4rm addr:$src)>;
+
def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
(VBROADCASTF64X4rm addr:$src)>;
def : Pat<(v16i32 (X86SubVBroadcast (loadv8i32 addr:$src))),
@@ -1539,6 +1565,19 @@ defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
v8f32x_info, v4f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
+def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+
def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF32X4Z256rm addr:$src)>;
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 9cea4cbc7b8a..98380b47cd89 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -109,6 +109,8 @@ def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86SubVBroadcastld : SDNode<"X86ISD::SUBV_BROADCAST_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>,
@@ -965,6 +967,16 @@ def X86VBroadcastld64 : PatFrag<(ops node:$src),
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
}]>;
+def X86SubVBroadcastld128 : PatFrag<(ops node:$src),
+ (X86SubVBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 16;
+}]>;
+
+def X86SubVBroadcastld256 : PatFrag<(ops node:$src),
+ (X86SubVBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 32;
+}]>;
+
// Scalar SSE intrinsic fragments to match several different types of loads.
// Used by scalar SSE intrinsic instructions which have 128 bit types, but
// only load a single element.
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 0fac3b6f1761..d4fdac0cee0e 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7016,6 +7016,11 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+
def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
@@ -7025,6 +7030,15 @@ def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index 000773b2af7b..4840369f7d93 100644
--- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -493,16 +493,16 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i128:
; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
@@ -527,16 +527,16 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
;
; AVX-64-LABEL: f64xi8_i128:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3
+; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3
+; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
@@ -971,16 +971,16 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i128:
; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
@@ -1005,16 +1005,16 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
;
; AVX-64-LABEL: f32xi16_i128:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
-; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
-; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
-; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3
+; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3
+; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
@@ -1307,16 +1307,16 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
; AVX-LABEL: f16xi32_i128:
; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
-; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
@@ -1341,16 +1341,16 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
;
; AVX-64-LABEL: f16xi32_i128:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
-; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3
+; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3
+; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
@@ -1423,16 +1423,16 @@ define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
; AVX-LABEL: f8xi64_i128:
; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,0]
-; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddq %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
+; AVX-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index bde44e6072b3..e6de535eceef 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -770,9 +770,9 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
;
; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX512: # %bb.0: # %entry
-; X86-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,0,3,0,4,0]
+; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
+; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; X86-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X86-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT: vmovdqu %ymm0, ga4
@@ -821,9 +821,9 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512: # %bb.0: # %entry
-; X64-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
+; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4]
+; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; X64-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
@@ -860,9 +860,9 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
;
; X86-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X86-AVX512: # %bb.0: # %entry
-; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
-; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X86-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT: vmovupd %ymm0, ga2
@@ -886,9 +886,9 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512: # %bb.0: # %entry
-; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
-; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vmovupd %ymm0, {{.*}}(%rip)
@@ -915,23 +915,23 @@ define void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <
; X86-AVX1-NEXT: movl %esp, %ebp
; X86-AVX1-NEXT: andl $-32, %esp
; X86-AVX1-NEXT: subl $32, %esp
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,3,4]
+; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
+; X86-AVX1-NEXT: # ymm3 = mem[0,1,0,1]
; X86-AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
; X86-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm4
-; X86-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
-; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; X86-AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5
+; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; X86-AVX1-NEXT: vpaddd 8(%ebp), %xmm3, %xmm5
-; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm3
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; X86-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; X86-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X86-AVX1-NEXT: vpaddd 8(%ebp), %xmm3, %xmm4
+; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm5
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vandps %ymm3, %ymm4, %ymm3
; X86-AVX1-NEXT: vmovdqu %xmm0, ha4
; X86-AVX1-NEXT: vmovups %ymm1, hb4
; X86-AVX1-NEXT: vmovups %ymm3, hc4+32
@@ -947,13 +947,13 @@ define void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <
; X86-AVX2-NEXT: movl %esp, %ebp
; X86-AVX2-NEXT: andl $-32, %esp
; X86-AVX2-NEXT: subl $32, %esp
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,3,4]
+; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
+; X86-AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; X86-AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3
; X86-AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; X86-AVX2-NEXT: vpaddd 8(%ebp), %ymm3, %ymm4
; X86-AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpaddd 8(%ebp), %ymm3, %ymm4
; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3
; X86-AVX2-NEXT: vmovdqu %xmm0, ha4
@@ -967,12 +967,11 @@ define void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <
;
; X86-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X86-AVX512: # %bb.0: # %entry
-; X86-AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,3,4]
+; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
+; X86-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; X86-AVX512-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm4
-; X86-AVX512-NEXT: vpaddd %ymm4, %ymm1, %ymm1
-; X86-AVX512-NEXT: vpand %ymm4, %ymm1, %ymm1
-; X86-AVX512-NEXT: vshufi32x4 {{.*#+}} zmm3 = zmm3[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X86-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; X86-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1
; X86-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2
; X86-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2
; X86-AVX512-NEXT: vmovdqu %xmm0, ha4
@@ -983,24 +982,24 @@ define void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <
;
; X64-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X64-AVX1: # %bb.0: # %entry
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,3,4]
+; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
+; X64-AVX1-NEXT: # ymm4 = mem[0,1,0,1]
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm5
-; X64-AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
-; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; X64-AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
+; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
+; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; X64-AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; X64-AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
-; X64-AVX1-NEXT: vandps %ymm5, %ymm3, %ymm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; X64-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
+; X64-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; X64-AVX1-NEXT: vmovdqu %xmm0, {{.*}}(%rip)
; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip)
; X64-AVX1-NEXT: vmovups %ymm3, hc4+{{.*}}(%rip)
@@ -1010,9 +1009,9 @@ define void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <
;
; X64-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X64-AVX2: # %bb.0: # %entry
-; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,3,4]
+; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
+; X64-AVX2-NEXT: # ymm4 = mem[0,1,0,1]
; X64-AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; X64-AVX2-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4
; X64-AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; X64-AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
@@ -1028,12 +1027,11 @@ define void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <
;
; X64-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X64-AVX512: # %bb.0: # %entry
-; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,3,4]
+; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
+; X64-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; X64-AVX512-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm4
-; X64-AVX512-NEXT: vpaddd %ymm4, %ymm1, %ymm1
-; X64-AVX512-NEXT: vpand %ymm4, %ymm1, %ymm1
-; X64-AVX512-NEXT: vshufi32x4 {{.*#+}} zmm3 = zmm3[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; X64-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1
; X64-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2
; X64-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2
; X64-AVX512-NEXT: vmovdqu %xmm0, {{.*}}(%rip)