[llvm] [X86] X86FixupVectorConstants - shrink vector load to movss/movsd/movd/movq 'zero upper' instructions (PR #79000)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 24 05:17:52 PST 2024
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/79000
>From 5ee98b4161781c33ff4f1b7a120cc27fdd37a67b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 22 Jan 2024 16:03:30 +0000
Subject: [PATCH] [X86] X86FixupVectorConstants - shrink vector load to
movss/movsd/movd/movq 'zero upper' instructions
If we're loading a vector constant that is known to be zero in the upper elements, then attempt to shrink the constant and just scalar load the lower 32/64 bits.
I've extended the constant fixup code to always attempt to use the smallest possible constant load, but for the same constant width prefer the scalar vzload over broadcasts (better chance of avoiding a domain clash).
---
.../Target/X86/X86FixupVectorConstants.cpp | 177 ++++++++++------
.../test/CodeGen/X86/2011-20-21-zext-ui2fp.ll | 2 +-
.../any_extend_vector_inreg_of_broadcast.ll | 5 +-
...d_vector_inreg_of_broadcast_from_memory.ll | 5 +-
llvm/test/CodeGen/X86/avx-load-store.ll | 2 +-
llvm/test/CodeGen/X86/avx2-arith.ll | 2 +-
.../avx512-shuffles/shuffle-chained-bf16.ll | 2 +-
llvm/test/CodeGen/X86/bitreverse.ll | 6 +-
llvm/test/CodeGen/X86/combine-srl.ll | 4 +-
llvm/test/CodeGen/X86/combine-subo.ll | 4 +-
.../test/CodeGen/X86/constant-pool-sharing.ll | 4 +-
llvm/test/CodeGen/X86/dpbusd.ll | 2 +-
llvm/test/CodeGen/X86/dpbusd_const.ll | 6 +-
.../CodeGen/X86/expand-vp-cast-intrinsics.ll | 2 +-
llvm/test/CodeGen/X86/fcmp-constant.ll | 2 +-
.../test/CodeGen/X86/fold-vector-sext-zext.ll | 24 +--
llvm/test/CodeGen/X86/half.ll | 4 +-
llvm/test/CodeGen/X86/icmp-pow2-mask.ll | 4 +-
.../X86/insert-into-constant-vector.ll | 8 +-
llvm/test/CodeGen/X86/insertelement-ones.ll | 4 +-
.../X86/masked_gather_scatter_widen.ll | 2 +-
llvm/test/CodeGen/X86/masked_store_trunc.ll | 8 +-
.../CodeGen/X86/memcmp-more-load-pairs.ll | 2 +-
llvm/test/CodeGen/X86/memcmp.ll | 2 +-
llvm/test/CodeGen/X86/pmaddubsw.ll | 7 +-
llvm/test/CodeGen/X86/pr46532.ll | 2 +-
llvm/test/CodeGen/X86/pr63108.ll | 12 +-
llvm/test/CodeGen/X86/pr74736.ll | 2 +-
llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll | 2 +-
llvm/test/CodeGen/X86/pshufb-mask-comments.ll | 2 +-
llvm/test/CodeGen/X86/ret-mmx.ll | 2 +-
llvm/test/CodeGen/X86/sad.ll | 2 +-
llvm/test/CodeGen/X86/sext-vsetcc.ll | 4 +-
llvm/test/CodeGen/X86/shrink_vmul.ll | 4 +-
llvm/test/CodeGen/X86/shuffle-half.ll | 2 +-
.../X86/shuffle-strided-with-offset-256.ll | 143 +++++--------
llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 76 +++----
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 4 +-
llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll | 56 ++---
llvm/test/CodeGen/X86/vec_anyext.ll | 8 +-
llvm/test/CodeGen/X86/vec_fp_to_int.ll | 10 +-
llvm/test/CodeGen/X86/vec_set-A.ll | 4 +-
llvm/test/CodeGen/X86/vector-blend.ll | 18 +-
.../X86/vector-constrained-fp-intrinsics.ll | 2 +-
llvm/test/CodeGen/X86/vector-fshl-128.ll | 16 +-
llvm/test/CodeGen/X86/vector-fshl-256.ll | 6 +-
llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 12 +-
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 2 +-
.../CodeGen/X86/vector-fshl-rot-sub128.ll | 6 +-
llvm/test/CodeGen/X86/vector-fshl-sub128.ll | 2 +-
llvm/test/CodeGen/X86/vector-fshr-128.ll | 16 +-
llvm/test/CodeGen/X86/vector-fshr-256.ll | 4 +-
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 12 +-
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 2 +-
.../CodeGen/X86/vector-fshr-rot-sub128.ll | 6 +-
llvm/test/CodeGen/X86/vector-fshr-sub128.ll | 2 +-
.../CodeGen/X86/vector-half-conversions.ll | 4 +-
.../vector-interleaved-load-i16-stride-3.ll | 16 +-
.../vector-interleaved-load-i16-stride-5.ll | 14 +-
.../vector-interleaved-load-i16-stride-6.ll | 14 +-
.../vector-interleaved-load-i16-stride-7.ll | 42 ++--
.../vector-interleaved-load-i16-stride-8.ll | 50 ++---
.../vector-interleaved-load-i32-stride-3.ll | 2 +-
.../vector-interleaved-load-i32-stride-4.ll | 8 +-
.../vector-interleaved-load-i32-stride-6.ll | 24 +--
.../vector-interleaved-load-i32-stride-7.ll | 45 ++--
.../vector-interleaved-load-i32-stride-8.ll | 2 +-
.../vector-interleaved-load-i8-stride-2.ll | 13 +-
.../vector-interleaved-load-i8-stride-4.ll | 108 ++++------
.../vector-interleaved-load-i8-stride-5.ll | 82 +++----
.../vector-interleaved-load-i8-stride-6.ll | 91 +++-----
.../vector-interleaved-load-i8-stride-7.ll | 122 +++++------
.../vector-interleaved-load-i8-stride-8.ll | 200 +++++++++---------
.../vector-interleaved-store-i8-stride-5.ll | 6 +-
.../vector-interleaved-store-i8-stride-7.ll | 3 +-
.../vector-interleaved-store-i8-stride-8.ll | 30 +--
llvm/test/CodeGen/X86/vector-lzcnt-128.ll | 112 +++++-----
llvm/test/CodeGen/X86/vector-lzcnt-256.ll | 16 +-
.../CodeGen/X86/vector-mulfix-legalize.ll | 4 +-
.../CodeGen/X86/vector-reduce-add-mask.ll | 4 +-
llvm/test/CodeGen/X86/vector-rotate-128.ll | 14 +-
llvm/test/CodeGen/X86/vector-rotate-256.ll | 2 +-
.../CodeGen/X86/vector-shift-ashr-sub128.ll | 6 +-
.../CodeGen/X86/vector-shift-lshr-sub128.ll | 6 +-
.../CodeGen/X86/vector-shift-shl-sub128.ll | 4 +-
.../CodeGen/X86/vector-shuffle-128-v16.ll | 11 +-
.../CodeGen/X86/vector-shuffle-256-v16.ll | 8 +-
.../CodeGen/X86/vector-shuffle-256-v32.ll | 34 ++-
.../test/CodeGen/X86/vector-shuffle-256-v8.ll | 8 +-
.../CodeGen/X86/vector-shuffle-512-v32.ll | 4 +-
.../CodeGen/X86/vector-shuffle-512-v64.ll | 8 +-
.../test/CodeGen/X86/vector-shuffle-512-v8.ll | 4 +-
.../CodeGen/X86/vector-shuffle-combining.ll | 56 ++---
llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 6 +-
llvm/test/CodeGen/X86/vector-trunc.ll | 4 +-
llvm/test/CodeGen/X86/vector-tzcnt-128.ll | 28 +--
llvm/test/CodeGen/X86/vselect-constants.ll | 4 +-
llvm/test/CodeGen/X86/vselect-post-combine.ll | 2 +-
llvm/test/CodeGen/X86/widen_bitcnt.ll | 20 +-
.../CodeGen/X86/x86-interleaved-access.ll | 24 +--
.../CodeGen/X86/zero_extend_vector_inreg.ll | 64 +++---
.../zero_extend_vector_inreg_of_broadcast.ll | 17 +-
...d_vector_inreg_of_broadcast_from_memory.ll | 17 +-
103 files changed, 959 insertions(+), 1135 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 9da8d7338ea36c..ca52cb384a7ee6 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -67,6 +67,9 @@ FunctionPass *llvm::createX86FixupVectorConstants() {
static std::optional<APInt> extractConstantBits(const Constant *C) {
unsigned NumBits = C->getType()->getPrimitiveSizeInBits();
+ if (auto *CUndef = dyn_cast<UndefValue>(C))
+ return APInt::getZero(NumBits);
+
if (auto *CInt = dyn_cast<ConstantInt>(C))
return CInt->getValue();
@@ -80,6 +83,18 @@ static std::optional<APInt> extractConstantBits(const Constant *C) {
return APInt::getSplat(NumBits, *Bits);
}
}
+
+ APInt Bits = APInt::getZero(NumBits);
+ for (unsigned I = 0, E = CV->getNumOperands(); I != E; ++I) {
+ Constant *Elt = CV->getOperand(I);
+ std::optional<APInt> SubBits = extractConstantBits(Elt);
+ if (!SubBits)
+ return std::nullopt;
+ assert(NumBits == (E * SubBits->getBitWidth()) &&
+ "Illegal vector element size");
+ Bits.insertBits(*SubBits, I * SubBits->getBitWidth());
+ }
+ return Bits;
}
if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
@@ -223,6 +238,35 @@ static Constant *rebuildSplatableConstant(const Constant *C,
return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits);
}
+static Constant *rebuildZeroUpperConstant(const Constant *C,
+ unsigned ScalarBitWidth) {
+ Type *Ty = C->getType();
+ Type *SclTy = Ty->getScalarType();
+ unsigned NumBits = Ty->getPrimitiveSizeInBits();
+ unsigned NumSclBits = SclTy->getPrimitiveSizeInBits();
+ LLVMContext &Ctx = C->getContext();
+
+ if (NumBits > ScalarBitWidth) {
+ // Determine if the upper bits are all zero.
+ if (std::optional<APInt> Bits = extractConstantBits(C)) {
+ if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) {
+ // If the original constant was made of smaller elements, try to retain
+ // those types.
+ if (ScalarBitWidth > NumSclBits && (ScalarBitWidth % NumSclBits) == 0)
+ return rebuildConstant(Ctx, SclTy, *Bits, NumSclBits);
+
+ // Fallback to raw integer bits.
+ APInt RawBits = Bits->zextOrTrunc(ScalarBitWidth);
+ return ConstantInt::get(Ctx, RawBits);
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+typedef std::function<Constant *(const Constant *, unsigned)> RebuildFn;
+
bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineInstr &MI) {
@@ -233,37 +277,45 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
bool HasBWI = ST->hasBWI();
bool HasVLX = ST->hasVLX();
- auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
- unsigned OpBcst64, unsigned OpBcst32,
- unsigned OpBcst16, unsigned OpBcst8,
- unsigned OperandNo) {
- assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
- "Unexpected number of operands!");
-
- if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
- // Attempt to detect a suitable splat from increasing splat widths.
- std::pair<unsigned, unsigned> Broadcasts[] = {
- {8, OpBcst8}, {16, OpBcst16}, {32, OpBcst32},
- {64, OpBcst64}, {128, OpBcst128}, {256, OpBcst256},
- };
- for (auto [BitWidth, OpBcst] : Broadcasts) {
- if (OpBcst) {
- // Construct a suitable splat constant and adjust the MI to
- // use the new constant pool entry.
- if (Constant *NewCst = rebuildSplatableConstant(C, BitWidth)) {
- unsigned NewCPI =
- CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
- MI.setDesc(TII->get(OpBcst));
- MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
- return true;
+ auto FixupConstant =
+ [&](unsigned OpBcst256, unsigned OpBcst128, unsigned OpBcst64,
+ unsigned OpBcst32, unsigned OpBcst16, unsigned OpBcst8,
+ unsigned OpUpper64, unsigned OpUpper32, unsigned OperandNo) {
+ assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+
+ if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
+ // Attempt to detect a suitable splat/vzload from increasing constant
+ // bitwidths.
+ // Prefer vzload vs broadcast for same bitwidth to avoid domain flips.
+ std::tuple<unsigned, unsigned, RebuildFn> FixupLoad[] = {
+ {8, OpBcst8, rebuildSplatableConstant},
+ {16, OpBcst16, rebuildSplatableConstant},
+ {32, OpUpper32, rebuildZeroUpperConstant},
+ {32, OpBcst32, rebuildSplatableConstant},
+ {64, OpUpper64, rebuildZeroUpperConstant},
+ {64, OpBcst64, rebuildSplatableConstant},
+ {128, OpBcst128, rebuildSplatableConstant},
+ {256, OpBcst256, rebuildSplatableConstant},
+ };
+ for (auto [BitWidth, Op, RebuildConstant] : FixupLoad) {
+ if (Op) {
+ // Construct a suitable constant and adjust the MI to use the new
+ // constant pool entry.
+ if (Constant *NewCst = RebuildConstant(C, BitWidth)) {
+ unsigned NewCPI =
+ CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
+ MI.setDesc(TII->get(Op));
+ MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
+ return true;
+ }
+ }
}
}
- }
- }
- return false;
- };
+ return false;
+ };
- // Attempt to convert full width vector loads into broadcast loads.
+ // Attempt to convert full width vector loads into broadcast/vzload loads.
switch (Opc) {
/* FP Loads */
case X86::MOVAPDrm:
@@ -271,79 +323,82 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
case X86::MOVUPDrm:
case X86::MOVUPSrm:
// TODO: SSE3 MOVDDUP Handling
- return false;
+ return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVSDrm, X86::MOVSSrm, 1);
case X86::VMOVAPDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPDrm:
case X86::VMOVUPSrm:
- return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
- 1);
+ return FixupConstant(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
+ X86::VMOVSDrm, X86::VMOVSSrm, 1);
case X86::VMOVAPDYrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPDYrm:
case X86::VMOVUPSYrm:
- return ConvertToBroadcast(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm,
- X86::VBROADCASTSSYrm, 0, 0, 1);
+ return FixupConstant(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm,
+ X86::VBROADCASTSSYrm, 0, 0, 0, 0, 1);
case X86::VMOVAPDZ128rm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVUPSZ128rm:
- return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
- X86::VBROADCASTSSZ128rm, 0, 0, 1);
+ return FixupConstant(0, 0, X86::VMOVDDUPZ128rm, X86::VBROADCASTSSZ128rm, 0,
+ 0, X86::VMOVSDZrm, X86::VMOVSSZrm, 1);
case X86::VMOVAPDZ256rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVUPSZ256rm:
- return ConvertToBroadcast(0, X86::VBROADCASTF32X4Z256rm,
- X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm,
- 0, 0, 1);
+ return FixupConstant(0, X86::VBROADCASTF32X4Z256rm, X86::VBROADCASTSDZ256rm,
+ X86::VBROADCASTSSZ256rm, 0, 0, 0, 0, 1);
case X86::VMOVAPDZrm:
case X86::VMOVAPSZrm:
case X86::VMOVUPDZrm:
case X86::VMOVUPSZrm:
- return ConvertToBroadcast(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
- X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0,
- 1);
+ return FixupConstant(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
+ X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 0, 0,
+ 1);
/* Integer Loads */
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVQI2PQIrm, X86::MOVDI2PDIrm,
+ 1);
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
- return ConvertToBroadcast(
- 0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
- HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
- HasAVX2 ? X86::VPBROADCASTWrm : 0, HasAVX2 ? X86::VPBROADCASTBrm : 0,
- 1);
+ return FixupConstant(0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
+ HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
+ HasAVX2 ? X86::VPBROADCASTWrm : 0,
+ HasAVX2 ? X86::VPBROADCASTBrm : 0, X86::VMOVQI2PQIrm,
+ X86::VMOVDI2PDIrm, 1);
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
- return ConvertToBroadcast(
+ return FixupConstant(
0, HasAVX2 ? X86::VBROADCASTI128rm : X86::VBROADCASTF128rm,
HasAVX2 ? X86::VPBROADCASTQYrm : X86::VBROADCASTSDYrm,
HasAVX2 ? X86::VPBROADCASTDYrm : X86::VBROADCASTSSYrm,
HasAVX2 ? X86::VPBROADCASTWYrm : 0, HasAVX2 ? X86::VPBROADCASTBYrm : 0,
- 1);
+ 0, 0, 1);
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQA64Z128rm:
case X86::VMOVDQU32Z128rm:
case X86::VMOVDQU64Z128rm:
- return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm,
- X86::VPBROADCASTDZ128rm,
- HasBWI ? X86::VPBROADCASTWZ128rm : 0,
- HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1);
+ return FixupConstant(0, 0, X86::VPBROADCASTQZ128rm, X86::VPBROADCASTDZ128rm,
+ HasBWI ? X86::VPBROADCASTWZ128rm : 0,
+ HasBWI ? X86::VPBROADCASTBZ128rm : 0,
+ X86::VMOVQI2PQIZrm, X86::VMOVDI2PDIZrm, 1);
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQU32Z256rm:
case X86::VMOVDQU64Z256rm:
- return ConvertToBroadcast(0, X86::VBROADCASTI32X4Z256rm,
- X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm,
- HasBWI ? X86::VPBROADCASTWZ256rm : 0,
- HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1);
+ return FixupConstant(0, X86::VBROADCASTI32X4Z256rm, X86::VPBROADCASTQZ256rm,
+ X86::VPBROADCASTDZ256rm,
+ HasBWI ? X86::VPBROADCASTWZ256rm : 0,
+ HasBWI ? X86::VPBROADCASTBZ256rm : 0, 0, 0, 1);
case X86::VMOVDQA32Zrm:
case X86::VMOVDQA64Zrm:
case X86::VMOVDQU32Zrm:
case X86::VMOVDQU64Zrm:
- return ConvertToBroadcast(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
- X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
- HasBWI ? X86::VPBROADCASTWZrm : 0,
- HasBWI ? X86::VPBROADCASTBZrm : 0, 1);
+ return FixupConstant(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
+ X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
+ HasBWI ? X86::VPBROADCASTWZrm : 0,
+ HasBWI ? X86::VPBROADCASTBZrm : 0, 0, 0, 1);
}
auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
@@ -368,7 +423,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
if (OpBcst32 || OpBcst64) {
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
- return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo);
+ return FixupConstant(0, 0, OpBcst64, OpBcst32, 0, 0, 0, 0, OpNo);
}
return false;
};
diff --git a/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll b/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
index e4e5d51d272d61..1ae51ee9756388 100644
--- a/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
+++ b/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
@@ -7,7 +7,7 @@
define void @ui_to_fp_conv(ptr nocapture %aFOO, ptr nocapture %RET) nounwind {
; CHECK-LABEL: ui_to_fp_conv:
; CHECK: # %bb.0: # %allocas
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: movups %xmm1, 16(%rsi)
; CHECK-NEXT: movups %xmm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index fe48059f9d0e65..e592b714a05dc8 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: paddb 48(%rsi), %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 32(%rsi), %xmm1
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX-NEXT: # xmm3 = mem[0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 2f576fe6715904..d3f6bd20a0127c 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 32(%rdi), %xmm1
; SSE42-NEXT: movdqa 48(%rdi), %xmm2
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX-NEXT: # xmm3 = mem[0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll
index 33eb704788740a..3f856d33145d86 100644
--- a/llvm/test/CodeGen/X86/avx-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx-load-store.ll
@@ -220,7 +220,7 @@ define void @f_f() nounwind {
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB9_4
; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check
;
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 3e695811719448..d32143cf33f2fa 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -234,7 +234,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) {
define <8 x i32> @mul_const9(<8 x i32> %x) {
; CHECK-LABEL: mul_const9:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0]
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll
index 99d6049fc1d865..12ce721b8c5d53 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll
@@ -13,7 +13,7 @@ define <2 x bfloat> @shuffle_chained_v32bf16_v2bf16(<32 x bfloat> %a) {
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
-; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,16,0,16,0,16,0,16]
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [0,16,0,0,0,0,0,0]
; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp)
; CHECK-NEXT: vmovaps (%rsp), %xmm0
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 2d978b5e991c91..26b1d64874e590 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -587,17 +587,17 @@ define <2 x i16> @fold_v2i16() {
;
; X64-LABEL: fold_v2i16:
; X64: # %bb.0:
-; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240,u,u,u,u,u,u]
+; X64-NEXT: movss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; X64-NEXT: retq
;
; X86XOP-LABEL: fold_v2i16:
; X86XOP: # %bb.0:
-; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
+; X86XOP-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; X86XOP-NEXT: retl
;
; GFNI-LABEL: fold_v2i16:
; GFNI: # %bb.0:
-; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
+; GFNI-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; GFNI-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
ret <2 x i16> %b
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 3e0f581ea01a0a..125196a0819b61 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -356,7 +356,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE: # %bb.0:
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pshufb %xmm0, %xmm2
; SSE-NEXT: psrlw $4, %xmm0
@@ -378,7 +378,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/combine-subo.ll b/llvm/test/CodeGen/X86/combine-subo.ll
index 235df0a666ee9f..5e4bba6e0fd35c 100644
--- a/llvm/test/CodeGen/X86/combine-subo.ll
+++ b/llvm/test/CodeGen/X86/combine-subo.ll
@@ -217,13 +217,13 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind {
define { <4 x i8>, <4 x i1> } @never_usub_const_vector() nounwind {
; SSE-LABEL: never_usub_const_vector:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: never_usub_const_vector:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,255,0,254,127,255,0,254,127,255,0,254,127,255,0,254]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: retq
%x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> <i8 255, i8 255, i8 255, i8 255>, <4 x i8> <i8 128, i8 0, i8 255, i8 1>)
diff --git a/llvm/test/CodeGen/X86/constant-pool-sharing.ll b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
index db5a7810974c9d..062d87ed035fd6 100644
--- a/llvm/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
@@ -77,7 +77,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
; SSE-LINUX: # %bb.0:
; SSE-LINUX-NEXT: xorps %xmm0, %xmm0
; SSE-LINUX-NEXT: movaps %xmm0, 48(%rdi)
-; SSE-LINUX-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551615,0]
+; SSE-LINUX-NEXT: movsd {{.*#+}} xmm1 = [18446744073709551615,0]
; SSE-LINUX-NEXT: movaps %xmm1, 32(%rdi)
; SSE-LINUX-NEXT: movaps %xmm1, 16(%rdi)
; SSE-LINUX-NEXT: movaps %xmm1, (%rdi)
@@ -92,7 +92,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
; SSE-MSVC: # %bb.0:
; SSE-MSVC-NEXT: xorps %xmm0, %xmm0
; SSE-MSVC-NEXT: movaps %xmm0, 48(%rcx)
-; SSE-MSVC-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551615,0]
+; SSE-MSVC-NEXT: movsd {{.*#+}} xmm1 = [18446744073709551615,0]
; SSE-MSVC-NEXT: movaps %xmm1, 32(%rcx)
; SSE-MSVC-NEXT: movaps %xmm1, 16(%rcx)
; SSE-MSVC-NEXT: movaps %xmm1, (%rcx)
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll
index 06f11e6d527bde..fbea08eb1e5502 100644
--- a/llvm/test/CodeGen/X86/dpbusd.ll
+++ b/llvm/test/CodeGen/X86/dpbusd.ll
@@ -379,7 +379,7 @@ define i32 @vpdpbusd_2xi32(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVX512VNNI-LABEL: vpdpbusd_2xi32:
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512VNNI-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT: vpandq %zmm1, %zmm2, %zmm1
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index 2fe9feb06dff67..5862e614265b1f 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -108,7 +108,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVXVNNI: # %bb.0: # %entry
; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVXVNNI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1
; AVXVNNI-NEXT: vmovd %xmm1, %eax
; AVXVNNI-NEXT: addl %edi, %eax
@@ -118,7 +118,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT: vmovd %xmm2, %eax
@@ -130,7 +130,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VLVNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2
; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
index 3b015acb69bd2e..0a52dfff71eda4 100644
--- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -532,7 +532,7 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0]
; AVX512-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: addq $40, %rsp
; AVX512-NEXT: .cfi_def_cfa_offset 8
diff --git a/llvm/test/CodeGen/X86/fcmp-constant.ll b/llvm/test/CodeGen/X86/fcmp-constant.ll
index 481a32b39dd377..335cb28213f929 100644
--- a/llvm/test/CodeGen/X86/fcmp-constant.ll
+++ b/llvm/test/CodeGen/X86/fcmp-constant.ll
@@ -92,7 +92,7 @@ define <2 x i64> @fcmp_ueq_v2f64_undef() {
define <2 x i64> @fcmp_ueq_v2f64_undef_elt() {
; CHECK-LABEL: fcmp_ueq_v2f64_undef_elt:
; CHECK: # %bb.0:
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,0]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [18446744073709551615,0]
; CHECK-NEXT: retq
%1 = fcmp ueq <2 x double> <double 0x3FF0000000000000, double 0xFFEFFFFFFFFFFFFF>, <double undef, double 0x3FF0000000000000>
%2 = sext <2 x i1> %1 to <2 x i64>
diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
index 3f8bd24c380492..d31168f4078901 100644
--- a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
+++ b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -11,14 +11,12 @@
define <4 x i16> @test_sext_4i8_4i16() {
; X32-LABEL: test_sext_4i8_4i16:
; X32: # %bb.0:
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533]
-; X32-NEXT: # xmm0 = mem[0,0]
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,2,65533,0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16:
; X64: # %bb.0:
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533]
-; X64-NEXT: # xmm0 = mem[0,0]
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,2,65533,0,0,0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
@@ -31,14 +29,12 @@ define <4 x i16> @test_sext_4i8_4i16() {
define <4 x i16> @test_sext_4i8_4i16_undef() {
; X32-LABEL: test_sext_4i8_4i16_undef:
; X32: # %bb.0:
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533]
-; X32-NEXT: # xmm0 = mem[0,0]
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,0,65533,0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16_undef:
; X64: # %bb.0:
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533]
-; X64-NEXT: # xmm0 = mem[0,0]
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,0,65533,0,0,0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
@@ -211,14 +207,12 @@ define <8 x i32> @test_sext_8i8_8i32_undef() {
define <4 x i16> @test_zext_4i8_4i16() {
; X32-LABEL: test_zext_4i8_4i16:
; X32: # %bb.0:
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253]
-; X32-NEXT: # xmm0 = mem[0,0]
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,2,253,0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16:
; X64: # %bb.0:
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253]
-; X64-NEXT: # xmm0 = mem[0,0]
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,2,253,0,0,0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
@@ -267,14 +261,12 @@ define <4 x i64> @test_zext_4i8_4i64() {
define <4 x i16> @test_zext_4i8_4i16_undef() {
; X32-LABEL: test_zext_4i8_4i16_undef:
; X32: # %bb.0:
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253]
-; X32-NEXT: # xmm0 = mem[0,0]
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,0,253,0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16_undef:
; X64: # %bb.0:
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253]
-; X64-NEXT: # xmm0 = mem[0,0]
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,0,253,0,0,0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index b42f6fdea34b65..d0853fdc748d29 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -2116,7 +2116,7 @@ define void @pr63114() {
; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm0
-; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
+; CHECK-LIBCALL-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
; CHECK-LIBCALL-NEXT: por %xmm2, %xmm0
; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm0
@@ -2181,7 +2181,7 @@ define void @pr63114() {
; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
; CHECK-I686-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
; CHECK-I686-NEXT: pand %xmm1, %xmm0
-; CHECK-I686-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
+; CHECK-I686-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
; CHECK-I686-NEXT: por %xmm2, %xmm0
; CHECK-I686-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
; CHECK-I686-NEXT: pand %xmm3, %xmm0
diff --git a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
index e2b3a23827a018..6d2866f50c6c7c 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
@@ -21,7 +21,7 @@ define <8 x i16> @pow2_mask_v16i8(i8 zeroext %0) {
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pcmpeqb %xmm1, %xmm0
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
@@ -97,7 +97,7 @@ define i64 @pow2_mask_v8i8(i8 zeroext %0) {
; SSE-NEXT: movd %edi, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,u,u,u,u,u,u,u,u]
+; SSE-NEXT: movq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,0,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index 418d632480328f..bc977e006606e8 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -375,7 +375,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-SSE-LABEL: elt5_v8i64:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [4,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = [4,0,0,0]
; X86-SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [42,0,1,0]
; X86-SSE-NEXT: movaps {{.*#+}} xmm1 = [2,0,3,0]
@@ -404,7 +404,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-AVX1-LABEL: elt5_v8i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0]
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = [4,0,0,0]
; X86-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-AVX1-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
@@ -421,7 +421,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-AVX2-LABEL: elt5_v8i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0]
+; X86-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [4,0,0,0]
; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-AVX2-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
@@ -439,7 +439,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512F-NEXT: vmovaps {{.*#+}} xmm2 = [4,0,0,0]
+; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [4,0,0,0]
; X86-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X86-AVX512F-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
index ed7cbb0a8430d9..cfcb8798e0b9cc 100644
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -280,7 +280,7 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
;
; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0]
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -385,7 +385,7 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
;
; AVX1-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0]
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
index f63545bcd178e5..aad1b443448503 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
@@ -739,7 +739,7 @@ define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index)
; WIDEN_AVX2-NEXT: vgatherdps %ymm5, (%rsi,%ymm1,4), %ymm6
; WIDEN_AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; WIDEN_AVX2-NEXT: vgatherdps %ymm3, (%rsi,%ymm0,4), %ymm1
-; WIDEN_AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
+; WIDEN_AVX2-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
; WIDEN_AVX2-NEXT: vgatherdps %ymm0, (%rsi,%ymm2,4), %ymm4
; WIDEN_AVX2-NEXT: vmovss %xmm4, 64(%rdi)
; WIDEN_AVX2-NEXT: vmovaps %ymm1, 32(%rdi)
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index f78646afccb345..6c21bb4d99e748 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -1395,7 +1395,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE4-LABEL: truncstore_v4i64_v4i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE4-NEXT: movd {{.*#+}} xmm4 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE4-NEXT: pshufb %xmm4, %xmm1
; SSE4-NEXT: pshufb %xmm4, %xmm0
; SSE4-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1435,7 +1435,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -3791,7 +3791,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1-LABEL: truncstore_v8i32_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
@@ -3865,7 +3865,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index 6eb02bfc1fd0c3..da46ea40655791 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -947,7 +947,7 @@ define i1 @length24_eq_const(ptr %X) nounwind {
; X64-MIC-AVX: # %bb.0:
; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
+; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm2 = [959985462,858927408,0,0]
; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0
; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
index f5e7384362a92b..83cb0d6f973be5 100644
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -1015,7 +1015,7 @@ define i1 @length24_eq_const(ptr %X) nounwind {
; X64-MIC-AVX: # %bb.0:
; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
+; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm2 = [959985462,858927408,0,0]
; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0
; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll
index ea0b4e4b21c775..e46a14673a5171 100644
--- a/llvm/test/CodeGen/X86/pmaddubsw.ll
+++ b/llvm/test/CodeGen/X86/pmaddubsw.ll
@@ -320,8 +320,7 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
-; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
@@ -349,9 +348,9 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) {
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %xmm0
; AVX256-NEXT: vmovdqa (%rsi), %xmm1
-; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
+; AVX256-NEXT: vmovq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX256-NEXT: vmovq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll
index ffaa131d5cbb1b..cbc677229ede61 100644
--- a/llvm/test/CodeGen/X86/pr46532.ll
+++ b/llvm/test/CodeGen/X86/pr46532.ll
@@ -7,7 +7,7 @@ define void @WhileWithLoopInvariantOperation.21() {
; CHECK-NEXT: movq (%rax), %rax
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, 32(%rax)
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
while.1.body.preheader:
%0 = load ptr, ptr undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2
diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll
index 38e45595a8846e..b1576851ed0236 100644
--- a/llvm/test/CodeGen/X86/pr63108.ll
+++ b/llvm/test/CodeGen/X86/pr63108.ll
@@ -11,11 +11,11 @@ define i32 @PR63108() {
; SSE-NEXT: testb %al, %al
; SSE-NEXT: je .LBB0_2
; SSE-NEXT: # %bb.1:
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [251,223,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE-NEXT: movd {{.*#+}} xmm0 = [57339,0,0,0]
; SSE-NEXT: jmp .LBB0_5
; SSE-NEXT: .LBB0_2: # %vector.body.preheader
; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [57339,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [57339,0,0,0]
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: .p2align 4, 0x90
; SSE-NEXT: .LBB0_3: # %vector.body
@@ -47,10 +47,10 @@ define i32 @PR63108() {
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: je .LBB0_2
; AVX1-NEXT: # %bb.1:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [251,223,0,0,251,223,0,0,251,223,0,0,251,223,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
; AVX1-NEXT: jmp .LBB0_5
; AVX1-NEXT: .LBB0_2: # %vector.body.preheader
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [57339,0,0,0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [57339,0,0,0]
; AVX1-NEXT: xorl %eax, %eax
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB0_3: # %vector.body
@@ -87,7 +87,7 @@ define i32 @PR63108() {
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223]
; AVX2-NEXT: jmp .LBB0_5
; AVX2-NEXT: .LBB0_2: # %vector.body.preheader
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [57339,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB0_3: # %vector.body
@@ -124,7 +124,7 @@ define i32 @PR63108() {
; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223]
; AVX512-NEXT: jmp .LBB0_5
; AVX512-NEXT: .LBB0_2: # %vector.body.preheader
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [57339,0,0,0]
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB0_3: # %vector.body
diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll
index 3dfdbf102c953e..1c3b4bd4971c11 100644
--- a/llvm/test/CodeGen/X86/pr74736.ll
+++ b/llvm/test/CodeGen/X86/pr74736.ll
@@ -6,7 +6,7 @@ define void @main(<16 x i32> %0, i32 %1) {
; SSE-LABEL: main:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movd %edi, %xmm4
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,0]
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: paddd %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
index bbe46a99ffa414..5f13e974874351 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
@@ -38,7 +38,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
define <16 x i8> @testv16i8(<16 x i8> %in) {
; AVX256-LABEL: testv16i8:
; AVX256: # %bb.0:
-; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX256-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll
index ef91e7a3f91075..b96338984d6f55 100644
--- a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -54,7 +54,7 @@ define <16 x i8> @test4(<16 x i8> %V, ptr %P) {
define <16 x i8> @test5(<16 x i8> %V) {
; CHECK-LABEL: test5:
; CHECK: # %bb.0:
-; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,0,0,0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1,0,0,0]
; CHECK-NEXT: movaps %xmm1, (%rax)
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,1]
; CHECK-NEXT: movaps %xmm1, (%rax)
diff --git a/llvm/test/CodeGen/X86/ret-mmx.ll b/llvm/test/CodeGen/X86/ret-mmx.ll
index 815f95e64496b0..81dd73363c1fb1 100644
--- a/llvm/test/CodeGen/X86/ret-mmx.ll
+++ b/llvm/test/CodeGen/X86/ret-mmx.ll
@@ -32,7 +32,7 @@ define <1 x i64> @t2() nounwind {
define <2 x i32> @t3() nounwind {
; CHECK-LABEL: t3:
; CHECK: ## %bb.0:
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0]
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0]
; CHECK-NEXT: retq
ret <2 x i32> <i32 1, i32 0>
}
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 3d3e935045475e..d043234705aa06 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -544,7 +544,7 @@ define dso_local i32 @sad_2i8() nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB3_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll
index 839c972b23dbab..de9d47526b166b 100644
--- a/llvm/test/CodeGen/X86/sext-vsetcc.ll
+++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll
@@ -216,7 +216,7 @@ define <4 x i32> @cmp_ult_load_const(ptr %x) nounwind {
; SSE-LABEL: cmp_ult_load_const:
; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [42,214,0,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE-NEXT: pmaxub %xmm0, %xmm1
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -243,7 +243,7 @@ define <3 x i32> @cmp_ult_load_const_bad_type(ptr %x) nounwind {
; SSE-LABEL: cmp_ult_load_const_bad_type:
; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [42,214,0,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [42,214,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE-NEXT: pmaxub %xmm0, %xmm1
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 8dd06e0d848d32..2610f4322c8e2b 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1745,7 +1745,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,u,u,u,u,u,u]
+; X86-SSE-NEXT: movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
@@ -1768,7 +1768,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,u,u,u,u,u,u]
+; X64-SSE-NEXT: movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/shuffle-half.ll b/llvm/test/CodeGen/X86/shuffle-half.ll
index 0d27fc38967668..291fe841043ed4 100644
--- a/llvm/test/CodeGen/X86/shuffle-half.ll
+++ b/llvm/test/CodeGen/X86/shuffle-half.ll
@@ -10,7 +10,7 @@ define <32 x half> @dump_vec() {
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.1: # %cond.load
; CHECK-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
index 42f1bd7824909b..f632654f89e04a 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
@@ -12,34 +12,22 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -50,7 +38,7 @@ define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -147,27 +135,16 @@ define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F: # %bb.0:
@@ -207,27 +184,16 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F: # %bb.0:
@@ -267,27 +233,16 @@ define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F: # %bb.0:
@@ -538,7 +493,7 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -598,7 +553,7 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -658,7 +613,7 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -718,7 +673,7 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -778,7 +733,7 @@ define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -838,7 +793,7 @@ define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -898,7 +853,7 @@ define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 0ab141f4c2022b..05dd2344d30f7b 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -382,27 +382,16 @@ define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # %bb.0:
@@ -447,27 +436,16 @@ define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind {
}
define void @trunc_v8i32_to_v8i8(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: trunc_v8i32_to_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F: # %bb.0:
@@ -519,7 +497,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -530,7 +508,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -692,7 +670,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -703,7 +681,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1021,7 +999,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1194,7 +1172,7 @@ define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1259,7 +1237,7 @@ define void @trunc_v4i64_to_v4i8(ptr %L, ptr %S) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 6e357a5fb34f50..85e160e497172a 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -392,7 +392,7 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
+; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -411,7 +411,7 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
+; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 348501caf619a3..777beea886f566 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -834,43 +834,43 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_packsswb_128_fold() {
; X86-SSE-LABEL: test_x86_sse2_packsswb_128_fold:
; X86-SSE: ## %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; X86-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A]
+; X86-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-SSE-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: test_x86_sse2_packsswb_128_fold:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX1-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX1-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: test_x86_sse2_packsswb_128_fold:
; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: test_x86_sse2_packsswb_128_fold:
; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; X64-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-SSE-NEXT: ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A]
+; X64-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-SSE-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: test_x86_sse2_packsswb_128_fold:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX1-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX1-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: test_x86_sse2_packsswb_128_fold:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
@@ -902,43 +902,43 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
; X86-SSE-LABEL: test_x86_sse2_packuswb_128_fold:
; X86-SSE: ## %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; X86-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A]
+; X86-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-SSE-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: test_x86_sse2_packuswb_128_fold:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX1-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: test_x86_sse2_packuswb_128_fold:
; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: test_x86_sse2_packuswb_128_fold:
; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; X64-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A]
+; X64-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-SSE-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: test_x86_sse2_packuswb_128_fold:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX1-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: test_x86_sse2_packuswb_128_fold:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll
index cdd30165a99bc3..09e4a4b3a773d1 100644
--- a/llvm/test/CodeGen/X86/vec_anyext.ll
+++ b/llvm/test/CodeGen/X86/vec_anyext.ll
@@ -173,7 +173,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovdqa (%ecx), %xmm0
; X86-NEXT: vmovdqa 16(%ecx), %xmm1
-; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; X86-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X86-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X86-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -211,8 +211,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
define <4 x i16> @const_16_32() nounwind {
; CHECK-LABEL: const_16_32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7]
-; CHECK-NEXT: # xmm0 = mem[0,0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [0,3,8,7,0,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
%G = trunc <4 x i32> <i32 0, i32 3, i32 8, i32 7> to <4 x i16>
ret <4 x i16> %G
@@ -221,8 +220,7 @@ define <4 x i16> @const_16_32() nounwind {
define <4 x i16> @const_16_64() nounwind {
; CHECK-LABEL: const_16_64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7]
-; CHECK-NEXT: # xmm0 = mem[0,0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [0,3,8,7,0,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
%G = trunc <4 x i64> <i64 0, i64 3, i64 8, i64 7> to <4 x i16>
ret <4 x i16> %G
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index 1d81cb37730800..a0e9f33483b69c 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -1907,13 +1907,12 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() {
define <4 x i32> @fptosi_2f64_to_2i32_const() {
; SSE-LABEL: fptosi_2f64_to_2i32_const:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,u,u]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4294967295,1,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i32_const:
; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,1,4294967295,1]
-; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,1,0,0]
; AVX-NEXT: retq
%cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -1966,13 +1965,12 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() {
define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i32_const:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,u,u]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [2,4,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_2f64_to_2i32_const:
; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [2,4,2,4]
-; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [2,4,0,0]
; AVX-NEXT: retq
%cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/vec_set-A.ll b/llvm/test/CodeGen/X86/vec_set-A.ll
index c8ff250b5bfbcd..a288579bda34ea 100644
--- a/llvm/test/CodeGen/X86/vec_set-A.ll
+++ b/llvm/test/CodeGen/X86/vec_set-A.ll
@@ -5,12 +5,12 @@
define <2 x i64> @test1() nounwind {
; X86-LABEL: test1:
; X86: # %bb.0:
-; X86-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0]
+; X86-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test1:
; X64: # %bb.0:
-; X64-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0]
+; X64-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0]
; X64-NEXT: retq
ret <2 x i64> < i64 1, i64 0 >
}
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index 73c2f4ca5b10bf..bd5c9363794aa1 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -79,22 +79,16 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
; SSE41-LABEL: vsel_4xi8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,255,0,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: vsel_4xi8:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: vsel_4xi8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: vsel_4xi8:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,255,0,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
ret <4 x i8> %vsel
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index acf45fc4bbeba4..0adb9ddfc426a8 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -4379,7 +4379,7 @@ define <2 x i32> @constrained_vector_fptoui_v2i32_v2f32() #0 {
;
; AVX512-LABEL: constrained_vector_fptoui_v2i32_v2f32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0]
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0]
; AVX512-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 4f100cd3e05309..550b2e06554385 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -1212,7 +1212,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE-LABEL: splatvar_funnnel_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0]
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pandn %xmm3, %xmm4
; SSE-NEXT: psrlw $1, %xmm1
@@ -1224,7 +1224,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX-LABEL: splatvar_funnnel_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1235,7 +1235,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1246,7 +1246,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1257,7 +1257,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1278,7 +1278,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1295,7 +1295,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; XOP-LABEL: splatvar_funnnel_v8i16:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1306,7 +1306,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
+; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
; X86-SSE2-NEXT: pandn %xmm3, %xmm4
; X86-SSE2-NEXT: psrlw $1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 05be4e1ee928e4..683fdf15cdea41 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -1000,7 +1000,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
@@ -1088,7 +1088,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
@@ -1278,7 +1278,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) {
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [31,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: .p2align 4, 0x90
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index 9ddd171b4db690..6a8d9d73f138b4 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -945,7 +945,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; SSE41-LABEL: splatvar_funnnel_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0]
+; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pandn %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm4
@@ -958,7 +958,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX-LABEL: splatvar_funnnel_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -969,7 +969,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -980,7 +980,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -991,7 +991,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -1002,7 +1002,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 58719e6bd8e0c3..6fc95cc7780ff4 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -757,7 +757,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
index 825ca727b624ea..3452b33ada2a9a 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
@@ -324,7 +324,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512BW-LABEL: constant_funnnel_v2i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
index 0b6361ffd4fae3..64deaf0e75966e 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
@@ -390,7 +390,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4,5,4,5]
+; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0]
; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 1f51f02a197ac4..70e3a025167439 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1337,7 +1337,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE-LABEL: splatvar_funnnel_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0]
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: psrlw %xmm4, %xmm1
@@ -1349,7 +1349,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX-LABEL: splatvar_funnnel_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1360,7 +1360,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1371,7 +1371,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1382,7 +1382,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1403,7 +1403,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1421,7 +1421,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; XOP-LABEL: splatvar_funnnel_v8i16:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1432,7 +1432,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
+; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
; X86-SSE2-NEXT: pand %xmm3, %xmm4
; X86-SSE2-NEXT: psrlw %xmm4, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 1a6ecea596563e..61aea6ad4d5955 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1032,7 +1032,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
@@ -1121,7 +1121,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 402eb73e18101b..3fa9994312e454 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -982,7 +982,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; SSE41-LABEL: splatvar_funnnel_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0]
+; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm4
@@ -995,7 +995,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX-LABEL: splatvar_funnnel_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -1006,7 +1006,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -1017,7 +1017,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -1028,7 +1028,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -1039,7 +1039,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index bb311468ce913c..b2047a04f163e6 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -796,7 +796,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
index b8c356711921d0..d78aa4e049e0a3 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
@@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512BW-LABEL: constant_funnnel_v2i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -366,7 +366,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
index 56896927e7e5ad..1add344e3e41fb 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
@@ -454,7 +454,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4,5,4,5]
+; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0]
; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 21533818bac11a..f59960f06f4a11 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3173,7 +3173,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; AVX512F-NEXT: callq __truncdfhf2 at PLT
; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [16,0,0,0]
+; AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [16,0,0,0]
; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -3195,7 +3195,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-FASTLANE-NEXT: callq __truncdfhf2 at PLT
; AVX512-FASTLANE-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX512-FASTLANE-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0]
+; AVX512-FASTLANE-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0]
; AVX512-FASTLANE-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512-FASTLANE-NEXT: addq $40, %rsp
; AVX512-FASTLANE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index d2bef6b234e383..47aab9e264c19a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -240,11 +240,11 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-SLOW-LABEL: load_i16_stride3_vf4:
; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
@@ -257,13 +257,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-FAST-LABEL: load_i16_stride3_vf4:
; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,5,8,11,2,5,8,11]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0]
; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm4
; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rdx)
@@ -967,8 +967,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm11
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm15
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm11[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1807,8 +1806,7 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm13[2],xmm8[3,4],xmm13[5],xmm8[6,7]
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm14
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm14[3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 904f8ddd2e7f44..0d5445dc194cec 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -409,22 +409,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-LABEL: load_i16_stride5_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,6,11,0,1,6,11,0]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,5,10,0,0,5,10,0]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax
; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,7,12,17,2,7,12,17]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4
; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6
; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
@@ -7449,7 +7449,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm15
; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm7
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3]
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [1,3,6,0,5,u,u,u]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm9
@@ -7482,7 +7482,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,4,6,3,6,u,u,u]
; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index fffe46d3cbed1d..3efae5c1150456 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -213,7 +213,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512BW-FAST-NEXT: vpbroadcastw 4(%rdi), %xmm4
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,9,3,9,3,9,3,9]
+; AVX512BW-FAST-NEXT: vmovd {{.*#+}} xmm5 = [3,9,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm5
; AVX512BW-FAST-NEXT: vpbroadcastw 20(%rdi), %xmm6
; AVX512BW-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm7
@@ -501,19 +501,19 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-LABEL: load_i16_stride6_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,12,18,0,6,12,18]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,7,13,19,1,7,13,19]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,8,14,20,2,8,14,20]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,9,15,21,3,9,15,21]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,10,16,22,4,10,16,22]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,11,17,23,5,11,17,23]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 4b32647f6fb3e7..853f94c84d1a8e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -240,7 +240,7 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512BW-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [6,13,6,13,6,13,6,13]
+; AVX512BW-FAST-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm8
; AVX512BW-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX512BW-FAST-NEXT: vmovd %xmm4, (%rdx)
@@ -658,21 +658,21 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,7,14,21,0,7,14,21]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,8,15,22,1,8,15,22]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,9,16,23,2,9,16,23]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,10,17,24,3,10,17,24]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,11,18,25,4,11,18,25]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,12,19,26,5,12,19,26]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,13,20,27,6,13,20,27]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
@@ -1343,7 +1343,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX512F-FAST-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
@@ -2380,7 +2380,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1,2,3,4,5,6,7],ymm11[8],ymm12[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm12 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm15
; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7]
@@ -2546,7 +2546,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm14
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
@@ -5161,7 +5161,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
@@ -5496,7 +5496,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
@@ -5566,7 +5566,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm14
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
@@ -11374,7 +11374,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
@@ -11431,7 +11431,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
@@ -11493,7 +11493,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -11522,7 +11522,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13
; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
@@ -12194,7 +12194,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm6 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
@@ -12324,9 +12324,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm4 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index 78065bc73c1d3f..054afe2dfdf2ed 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -271,23 +271,23 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,9,17,25,1,9,17,25]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,10,18,26,2,10,18,26]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,11,19,27,3,11,19,27]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,12,20,28,4,12,20,28]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,13,21,29,5,13,21,29]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,14,22,30,6,14,22,30]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [7,15,23,31,7,15,23,31]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
@@ -560,7 +560,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm15 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -615,7 +615,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm2
; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm9, %xmm2
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
@@ -626,7 +626,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm1, %xmm0
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [3,7,3,7]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm10 = [3,7,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm10, %xmm15
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14
@@ -1353,7 +1353,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm14
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -1490,7 +1490,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1
; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm15, %xmm1
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
@@ -1523,7 +1523,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm18 = [3,7,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm18, %xmm11
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -3238,7 +3238,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm11, %xmm0, %xmm7
; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm6
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0
@@ -3409,7 +3409,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm12, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
@@ -3563,7 +3563,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0
; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0
; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm11
@@ -3630,7 +3630,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm27 = [3,7,3,7]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm27 = [3,7,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm27, %xmm9
; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm0
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
@@ -3733,7 +3733,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31
; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm0
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = [1,5,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm4, %xmm0
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
@@ -7297,7 +7297,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1
; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm0, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16
@@ -7711,7 +7711,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1
; AVX512F-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -8050,7 +8050,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0
; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm9, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm30
@@ -8225,7 +8225,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1
; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm0, %xmm1
; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16
@@ -8477,7 +8477,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm15, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
@@ -8642,7 +8642,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0]
; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 4ed9a99c58c3f1..cce2340d214aa6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -78,7 +78,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0]
+; AVX512-FAST-NEXT: vmovd {{.*#+}} xmm3 = [5,0,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0
; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 40355e3f7686f6..30ac7dc443182e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -78,7 +78,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
+; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FAST-NEXT: vmovq %xmm2, (%rsi)
@@ -160,8 +160,7 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5]
-; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
+; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm7 = [1,5,0,0]
; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
@@ -171,8 +170,7 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7]
-; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = [3,7,0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index d3f9b1c4e15a72..317875ca0ba7a0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -86,12 +86,10 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,2,4,2]
-; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
-; AVX2-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0
; AVX2-ONLY-NEXT: vmovlps %xmm4, (%rsi)
; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rdx)
@@ -118,13 +116,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-SLOW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2]
-; AVX512-SLOW-NEXT: # xmm2 = mem[0,0]
+; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512-SLOW-NEXT: vmovaps 32(%rdi), %ymm5
; AVX512-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
-; AVX512-SLOW-NEXT: # xmm6 = mem[0,0]
+; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX512-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512-SLOW-NEXT: vmovq %xmm1, (%rdx)
@@ -147,15 +143,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5]
+; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm5 = [3,5,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
-; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
-; AVX512-FAST-NEXT: # xmm1 = mem[0,0]
+; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
; AVX512-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1
-; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
-; AVX512-FAST-NEXT: # xmm6 = mem[0,0]
+; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3
; AVX512-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512-FAST-NEXT: vmovq %xmm4, (%rdx)
@@ -308,13 +302,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [4,2,4,2]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm2
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,3,5,3]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [5,3,0,0]
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index e4f94b368b7fd5..abd4525a677375 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -100,8 +100,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3]
-; AVX2-ONLY-NEXT: # xmm4 = mem[0,0]
+; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
@@ -134,7 +133,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11]
+; AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [4,11,0,0]
; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %ymm6
; AVX512-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
@@ -165,9 +164,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2]
+; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm5 = [7,2,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11]
+; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = [4,11,0,0]
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm6
; AVX512-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
@@ -355,8 +354,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
-; AVX2-SLOW-NEXT: # xmm10 = mem[0,0]
+; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -414,8 +412,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
-; AVX2-FAST-NEXT: # xmm10 = mem[0,0]
+; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -472,8 +469,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
-; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,0]
+; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -850,7 +846,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -954,7 +950,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -1058,7 +1054,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -1880,7 +1876,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10
@@ -2104,7 +2100,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [4,3,0,0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11
; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12
@@ -2332,7 +2328,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10
@@ -4253,7 +4249,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
@@ -4763,7 +4759,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [4,3,0,0]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
@@ -5274,7 +5270,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
@@ -9154,8 +9150,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3]
-; AVX2-SLOW-NEXT: # xmm7 = mem[0,0]
+; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm7 = [4,3,0,0]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
@@ -10195,8 +10190,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3]
-; AVX2-FAST-NEXT: # xmm4 = mem[0,0]
+; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
@@ -11245,8 +11239,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3]
-; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0]
+; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm7 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index faa1642831c155..8f0c4b9b726a5e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -153,7 +153,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
+; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
index 88ebda3622cc9b..ec029c3f9c033b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
@@ -182,8 +182,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -199,7 +198,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX2-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX2-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -215,7 +214,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -281,8 +280,7 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
@@ -438,8 +436,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm8, %xmm12
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm1
; AVX1-ONLY-NEXT: vpackuswb %xmm12, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index 40894a14a681c7..fb9cb361ff8631 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -214,57 +214,31 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movq %xmm1, (%r8)
; SSE-NEXT: retq
;
-; AVX1-ONLY-LABEL: load_i8_stride4_vf8:
-; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-ONLY-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-ONLY-NEXT: vmovq %xmm3, (%rdx)
-; AVX1-ONLY-NEXT: vmovq %xmm4, (%rcx)
-; AVX1-ONLY-NEXT: vmovq %xmm1, (%r8)
-; AVX1-ONLY-NEXT: retq
-;
-; AVX2-ONLY-LABEL: load_i8_stride4_vf8:
-; AVX2-ONLY: # %bb.0:
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-ONLY-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-ONLY-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-ONLY-NEXT: vmovq %xmm1, (%r8)
-; AVX2-ONLY-NEXT: retq
+; AVX1-LABEL: load_i8_stride4_vf8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX1-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: vmovq %xmm3, (%rdx)
+; AVX1-NEXT: vmovq %xmm4, (%rcx)
+; AVX1-NEXT: vmovq %xmm1, (%r8)
+; AVX1-NEXT: retq
;
; AVX512-LABEL: load_i8_stride4_vf8:
; AVX512: # %bb.0:
@@ -413,7 +387,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
@@ -422,7 +396,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
@@ -431,7 +405,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
@@ -440,7 +414,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -461,7 +435,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
@@ -470,7 +444,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
@@ -479,7 +453,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
@@ -488,7 +462,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -780,7 +754,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm4
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
@@ -804,7 +778,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm11
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm13
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
@@ -822,7 +796,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm12
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -840,7 +814,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
@@ -1545,7 +1519,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5
@@ -1611,7 +1585,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -1662,7 +1636,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
@@ -1705,7 +1679,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index 130207fd9c2ebc..03139923ae5c46 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -150,47 +150,26 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movd %xmm3, (%r9)
; SSE-NEXT: retq
;
-; AVX1-ONLY-LABEL: load_i8_stride5_vf4:
-; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
-; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm3
-; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-ONLY-NEXT: vmovd %xmm3, (%rsi)
-; AVX1-ONLY-NEXT: vmovd %xmm4, (%rdx)
-; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx)
-; AVX1-ONLY-NEXT: vmovd %xmm6, (%r8)
-; AVX1-ONLY-NEXT: vmovd %xmm0, (%r9)
-; AVX1-ONLY-NEXT: retq
-;
-; AVX2-LABEL: load_i8_stride5_vf4:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
-; AVX2-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3
-; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
-; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
-; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
-; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm3, (%rsi)
-; AVX2-NEXT: vmovd %xmm4, (%rdx)
-; AVX2-NEXT: vmovd %xmm5, (%rcx)
-; AVX2-NEXT: vmovd %xmm6, (%r8)
-; AVX2-NEXT: vmovd %xmm0, (%r9)
-; AVX2-NEXT: retq
+; AVX-LABEL: load_i8_stride5_vf4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
+; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
+; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
+; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm3, (%rsi)
+; AVX-NEXT: vmovd %xmm4, (%rdx)
+; AVX-NEXT: vmovd %xmm5, (%rcx)
+; AVX-NEXT: vmovd %xmm6, (%r8)
+; AVX-NEXT: vmovd %xmm0, (%r9)
+; AVX-NEXT: retq
%wide.vec = load <20 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
%strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
@@ -707,7 +686,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm5
@@ -1551,7 +1530,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm11
@@ -3169,7 +3148,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1
@@ -3191,11 +3170,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0]
-; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128]
@@ -3351,8 +3328,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0]
-; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm0
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5]
@@ -3485,8 +3461,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0]
-; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7]
@@ -3494,8 +3469,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index 899f38951342e0..ed179d99595468 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -380,8 +380,7 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,1,2,3,4,128,128,128]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5
@@ -825,7 +824,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm10, %xmm11, %xmm10
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
@@ -1728,7 +1727,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
@@ -1743,13 +1742,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0]
-; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
@@ -1763,7 +1760,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm10
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm10, %xmm4
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm4
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm10
@@ -1804,7 +1801,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm15
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0]
@@ -3597,11 +3594,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9
; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10
; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm1 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12]
@@ -3625,16 +3620,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15
@@ -3652,12 +3645,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1
@@ -3668,7 +3659,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm1
@@ -3682,13 +3673,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14
@@ -3715,13 +3704,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0]
-; AVX1-ONLY-NEXT: # xmm15 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1
@@ -3737,7 +3724,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5
@@ -3825,12 +3812,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0]
-; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0]
-; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
@@ -3842,7 +3827,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm14
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0]
@@ -3919,12 +3904,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0]
-; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm13 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1
@@ -3941,7 +3924,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1
; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm1
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm5
@@ -3987,11 +3970,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0]
-; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm13 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0
@@ -4009,7 +3990,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0
; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13]
@@ -4054,11 +4035,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0]
-; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -4121,12 +4100,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm15 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0]
-; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm5
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 718d4973f97e25..a7a468d15c4021 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -225,16 +225,16 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6
@@ -303,16 +303,16 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm5
; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpshufb %xmm6, %xmm0, %xmm7
; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm9
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
; AVX512F-NEXT: vpshufb %xmm6, %xmm1, %xmm6
@@ -654,7 +654,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
@@ -1403,7 +1403,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1
@@ -1415,7 +1415,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm8, %xmm9, %xmm8
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u]
@@ -1455,7 +1455,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12]
; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13
; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
@@ -2935,7 +2935,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u]
@@ -2952,7 +2952,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u]
@@ -2961,7 +2961,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
@@ -2988,7 +2988,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u]
@@ -3096,7 +3096,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11]
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -3115,7 +3115,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm10
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
@@ -3127,7 +3127,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm7
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
@@ -3153,7 +3153,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10
; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm3
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
@@ -3179,7 +3179,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5
; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm5
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u]
@@ -6763,11 +6763,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10
; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8
; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm4
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,7,14,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
@@ -6781,7 +6779,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm8
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm8
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm7, %xmm8, %xmm7
; AVX1-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm7
@@ -6863,11 +6861,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0]
-; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm2
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0]
-; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3]
@@ -6877,7 +6873,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0
@@ -6888,12 +6884,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0]
-; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3
@@ -6904,7 +6898,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm3, %xmm13, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -6920,11 +6914,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm14, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm12
; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm2
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3]
@@ -6946,11 +6938,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm3
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4]
@@ -6968,12 +6958,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0]
-; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm1
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
@@ -6984,7 +6972,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -6999,12 +6987,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,2,9,128,128,128,0]
-; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm15
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm5
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128]
@@ -7025,11 +7011,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm14
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0]
-; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4
@@ -7098,10 +7082,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm14
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -7167,7 +7151,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15]
; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm10
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,7,14,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm12
@@ -7286,7 +7270,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm7
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -7341,7 +7325,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm6
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm7
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
@@ -7371,9 +7355,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm3
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -7442,9 +7426,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm3
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -7512,7 +7496,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
@@ -9561,7 +9545,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm2
; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm17, %ymm7, %ymm28
-; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512F-ONLY-SLOW-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm21, %ymm15
; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm29, %ymm7
; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -9977,7 +9961,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm3
; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16
-; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512F-ONLY-FAST-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm13
; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm14
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -10412,7 +10396,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm2
; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm21, %ymm7, %ymm16
-; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512DQ-SLOW-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm15
; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm7
; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -10829,7 +10813,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm8
; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16
-; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512DQ-FAST-NEXT: vmovd {{.*#+}} xmm6 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm0, %ymm13
; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm0, %ymm31, %ymm14
; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index f2133b9e42d30d..0f779732c0fd52 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -240,37 +240,37 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm8
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -592,7 +592,7 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
@@ -600,70 +600,70 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1287,7 +1287,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6
@@ -1295,7 +1295,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
@@ -1311,11 +1311,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,5],xmm9[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
@@ -1331,11 +1331,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5],xmm10[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9
@@ -1351,11 +1351,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
@@ -1370,11 +1370,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5],xmm12[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
@@ -1389,11 +1389,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5],xmm13[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm15
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
@@ -1408,11 +1408,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm15
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
@@ -1427,11 +1427,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -2850,7 +2850,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
@@ -2862,7 +2862,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
@@ -2914,13 +2914,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm0
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -2965,11 +2965,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm7
@@ -3011,13 +3011,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8
@@ -3062,12 +3062,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -3108,13 +3108,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -3160,12 +3160,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -3208,12 +3208,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -4529,14 +4529,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm13
; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12
; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm10
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8
; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm9
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm13
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1
@@ -4577,12 +4577,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm14
; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm12
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm14
@@ -4619,12 +4619,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm14
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm27
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
@@ -4662,13 +4662,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm14
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm22
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm25
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
@@ -4705,13 +4705,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4
@@ -4749,11 +4749,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5
@@ -4789,11 +4789,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
@@ -7495,7 +7495,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -7503,7 +7503,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -7625,12 +7625,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
@@ -7724,12 +7724,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
@@ -7822,13 +7822,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14
@@ -7919,13 +7919,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -8019,12 +8019,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14
@@ -8116,13 +8116,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -8212,13 +8212,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm1 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm14
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
@@ -10910,14 +10910,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm11
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm31
@@ -11008,13 +11008,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm24
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm9
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm30
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5
@@ -11100,13 +11100,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm28
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11
@@ -11189,13 +11189,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm8
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm9
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm12
@@ -11278,13 +11278,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm27
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15
@@ -11372,13 +11372,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5
@@ -11462,13 +11462,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index c4bf0c3630e835..9ceb3a250bc418 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -342,7 +342,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-SLOW-NEXT: shrq $48, %rax
; AVX2-SLOW-NEXT: vmovd %eax, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9)
@@ -374,7 +374,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FAST-NEXT: shrq $48, %rax
; AVX2-FAST-NEXT: vmovd %eax, %xmm1
; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9)
@@ -406,7 +406,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FAST-PERLANE-NEXT: shrq $48, %rax
; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 482da013d741be..3aab872fa4a91d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -475,8 +475,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,255,255,255,255,0,0,0]
-; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index e60034f7d6b75f..dcafb526790638 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -187,13 +187,11 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,2,6,10,14,3,7,11,15]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,4,8,12,1,5,9,13]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -361,47 +359,39 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,2,10,0,0,3,11]
-; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,2,10,0,0,3,11,0,0]
-; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,8,0,0,1,9]
-; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,8,0,0,1,9,0,0]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,6,14,0,0,7,15]
-; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,6,14,0,0,7,15,0,0]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,4,12,0,0,5,13]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,4,12,0,0,5,13,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
index 122b478577fbfe..5d02bb8b05f181 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -96,7 +96,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; SSSE3-LABEL: testv2i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
@@ -128,7 +128,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; SSE41-LABEL: testv2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
@@ -160,7 +160,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; AVX1OR2-LABEL: testv2i64:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -188,7 +188,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; AVX512VL-LABEL: testv2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -216,7 +216,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv2i64:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -257,7 +257,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; X86-SSE-LABEL: testv2i64:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pshufb %xmm0, %xmm4
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -374,7 +374,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; SSSE3-LABEL: testv2i64u:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
@@ -406,7 +406,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; SSE41-LABEL: testv2i64u:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
@@ -438,7 +438,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; AVX1OR2-LABEL: testv2i64u:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -466,7 +466,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; AVX512VL-LABEL: testv2i64u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -494,7 +494,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv2i64u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -535,7 +535,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; X86-SSE-LABEL: testv2i64u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pshufb %xmm0, %xmm4
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -656,7 +656,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; SSSE3-LABEL: testv4i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
@@ -682,7 +682,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; SSE41-LABEL: testv4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -708,7 +708,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; AVX1OR2-LABEL: testv4i32:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -731,7 +731,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; AVX512VL-LABEL: testv4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -754,7 +754,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv4i32:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -790,7 +790,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; X86-SSE-LABEL: testv4i32:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pshufb %xmm0, %xmm4
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -905,7 +905,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; SSSE3-LABEL: testv4i32u:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
@@ -931,7 +931,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; SSE41-LABEL: testv4i32u:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -957,7 +957,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; AVX1OR2-LABEL: testv4i32u:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -980,7 +980,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; AVX512VL-LABEL: testv4i32u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1003,7 +1003,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv4i32u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1039,7 +1039,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; X86-SSE-LABEL: testv4i32u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pshufb %xmm0, %xmm4
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1142,7 +1142,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
@@ -1162,7 +1162,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; SSE41-LABEL: testv8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -1182,7 +1182,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; AVX1OR2-LABEL: testv8i16:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -1200,7 +1200,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; AVX512VL-LABEL: testv8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1218,7 +1218,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv8i16:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1254,7 +1254,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; X86-SSE-LABEL: testv8i16:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm2, %xmm3
; X86-SSE-NEXT: pshufb %xmm0, %xmm3
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1350,7 +1350,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; SSSE3-LABEL: testv8i16u:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
@@ -1370,7 +1370,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; SSE41-LABEL: testv8i16u:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -1390,7 +1390,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; AVX1OR2-LABEL: testv8i16u:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -1408,7 +1408,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; AVX512VL-LABEL: testv8i16u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1426,7 +1426,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv8i16u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1462,7 +1462,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; X86-SSE-LABEL: testv8i16u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm2, %xmm3
; X86-SSE-NEXT: pshufb %xmm0, %xmm3
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1552,7 +1552,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; SSSE3-LABEL: testv16i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: psrlw $4, %xmm0
@@ -1567,7 +1567,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; SSE41-LABEL: testv16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: psrlw $4, %xmm0
@@ -1582,7 +1582,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; AVX1OR2-LABEL: testv16i8:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1595,7 +1595,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; AVX512VL-LABEL: testv16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
@@ -1608,7 +1608,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv16i8:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
@@ -1630,7 +1630,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; X86-SSE-LABEL: testv16i8:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pshufb %xmm0, %xmm2
; X86-SSE-NEXT: psrlw $4, %xmm0
@@ -1715,7 +1715,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; SSSE3-LABEL: testv16i8u:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: psrlw $4, %xmm0
@@ -1730,7 +1730,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; SSE41-LABEL: testv16i8u:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: psrlw $4, %xmm0
@@ -1745,7 +1745,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; AVX1OR2-LABEL: testv16i8u:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1758,7 +1758,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; AVX512VL-LABEL: testv16i8u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
@@ -1771,7 +1771,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv16i8u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
@@ -1793,7 +1793,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; X86-SSE-LABEL: testv16i8u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pshufb %xmm0, %xmm2
; X86-SSE-NEXT: psrlw $4, %xmm0
@@ -1812,22 +1812,22 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0]
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv2i64:
; NOBW: # %bb.0:
-; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0]
+; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0]
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv2i64:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0]
; AVX512VLBWDQ-NEXT: retq
;
; X86-SSE-LABEL: foldv2i64:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0]
; X86-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
ret <2 x i64> %out
@@ -1836,22 +1836,22 @@ define <2 x i64> @foldv2i64() nounwind {
define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0]
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv2i64u:
; NOBW: # %bb.0:
-; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0]
+; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0]
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv2i64u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0]
; AVX512VLBWDQ-NEXT: retq
;
; X86-SSE-LABEL: foldv2i64u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0]
; X86-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
ret <2 x i64> %out
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
index fe6836c045f3be..8a0d9a6134cea5 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -13,7 +13,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -199,7 +199,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -385,7 +385,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -541,7 +541,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -697,7 +697,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -818,7 +818,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -939,7 +939,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -1035,7 +1035,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
index 82afa15079182f..b233855029c582 100644
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -13,7 +13,7 @@ declare <4 x i16> @llvm.umul.fix.sat.v4i16(<4 x i16>, <4 x i16>, i32 immarg)
define <4 x i16> @smulfix(<4 x i16> %a) {
; CHECK-LABEL: smulfix:
; CHECK: # %bb.0:
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u]
+; CHECK-NEXT: movq {{.*#+}} xmm1 = [1,2,3,4,0,0,0,0]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm2
; CHECK-NEXT: psrlw $15, %xmm2
@@ -28,7 +28,7 @@ define <4 x i16> @smulfix(<4 x i16> %a) {
define <4 x i16> @umulfix(<4 x i16> %a) {
; CHECK-LABEL: umulfix:
; CHECK: # %bb.0:
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u]
+; CHECK-NEXT: movq {{.*#+}} xmm1 = [1,2,3,4,0,0,0,0]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm2
; CHECK-NEXT: psrlw $15, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index dbdc3e09fcef08..8056b9a2963c31 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -864,7 +864,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
;
; SSE41-LABEL: test_v4i16_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -914,7 +914,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
; AVX512BW-LABEL: test_v4i16_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 43c9be2dc6f976..ad810c092bf55e 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -719,7 +719,7 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; X86-SSE2-LABEL: splatvar_rotate_v2i64:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,0,0,0]
+; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = [64,0,0,0]
; X86-SSE2-NEXT: psubq %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: psllq %xmm1, %xmm3
@@ -815,7 +815,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; SSE41-LABEL: splatvar_rotate_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0]
+; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pandn %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm4
@@ -828,7 +828,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX-LABEL: splatvar_rotate_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -839,7 +839,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512F-LABEL: splatvar_rotate_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -850,7 +850,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512VL-LABEL: splatvar_rotate_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -861,7 +861,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512BW-LABEL: splatvar_rotate_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -872,7 +872,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512VLBW-LABEL: splatvar_rotate_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index c55335f8495697..dae2cf382b8205 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -649,7 +649,7 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index e15be224bec8cd..ff41d883380a86 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1786,7 +1786,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,16384,8192,u,u,u,u]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,0,16384,8192,0,0,0,0]
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: psraw $1, %xmm0
@@ -1818,7 +1818,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1896,7 +1896,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index f340615464cfa7..71719e03c7c6dc 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1486,7 +1486,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
@@ -1511,7 +1511,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1581,7 +1581,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index 4d4642b18878eb..19645fd08c9468 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -1339,7 +1339,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1399,7 +1399,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index b40be9452ddd77..e298091bfb983f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1003,7 +1003,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; SSSE3-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1011,7 +1011,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; SSE41-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1019,8 +1019,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; AVX1-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1028,7 +1027,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; AVX2-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1036,7 +1035,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; AVX512VLBW-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512VLBW-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 5b4f15b51ec00a..bac63f2ddb5059 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -539,7 +539,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -580,7 +580,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -621,7 +621,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -662,7 +662,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [15,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index baa942794ccd8a..1cfa5e6dfdff53 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1281,7 +1281,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1328,7 +1328,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1375,7 +1375,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1422,7 +1422,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1469,7 +1469,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1516,7 +1516,7 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1563,7 +1563,7 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1610,7 +1610,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [31,0,0,0]
+; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [31,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -2565,12 +2565,10 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,15,14,13,12,11,10,9,8]
-; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0]
-; AVX1-NEXT: # xmm5 = mem[0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
@@ -2771,7 +2769,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2785,7 +2783,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2803,7 +2801,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2817,7 +2815,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; XOPAVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2835,7 +2833,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2849,7 +2847,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; XOPAVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index a69a1a18f26e50..02cad49d29fdb2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -208,7 +208,7 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8f32_06000000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0]
+; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -225,7 +225,7 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8f32_70000000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1808,7 +1808,7 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8i32_06000000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0]
+; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1825,7 +1825,7 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8i32_70000000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index fa0ec33bf34080..24b1b42c2dc058 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -199,13 +199,13 @@ define <32 x i16> @shuffle_v32i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_1
define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; KNL: ## %bb.0:
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; KNL-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; SKX: ## %bb.0:
-; SKX-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
+; SKX-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0]
; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 4df5307316a426..f4cc8522adec56 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -109,25 +109,25 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512DQ-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0]
; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512VBMI-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 9bfca824fb71a9..008593a239f869 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -142,7 +142,7 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_70000000:
; ALL: # %bb.0:
-; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -960,7 +960,7 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_70000000:
; ALL: # %bb.0:
-; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 1e2ee7b99a608d..d285d07e66049c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1744,26 +1744,18 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movss {{.*#+}} xmm0 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_test1c:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_test1c:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_test1c:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%A = load <4 x i8>, ptr %a
%B = load <4 x i8>, ptr %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1838,26 +1830,18 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_test4c:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_test4c:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_test4c:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%A = load <4 x i8>, ptr %a
%B = load <4 x i8>, ptr %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -3187,7 +3171,7 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
define void @PR43024() {
; SSE2-LABEL: PR43024:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE2-NEXT: movaps %xmm0, (%rax)
; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
@@ -3198,7 +3182,7 @@ define void @PR43024() {
;
; SSSE3-LABEL: PR43024:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSSE3-NEXT: movaps %xmm0, (%rax)
; SSSE3-NEXT: addss %xmm0, %xmm0
; SSSE3-NEXT: xorps %xmm1, %xmm1
@@ -3209,7 +3193,7 @@ define void @PR43024() {
;
; SSE41-LABEL: PR43024:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE41-NEXT: movaps %xmm0, (%rax)
; SSE41-NEXT: addss %xmm0, %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
@@ -3220,7 +3204,7 @@ define void @PR43024() {
;
; AVX-LABEL: PR43024:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; AVX-NEXT: vmovaps %xmm0, (%rax)
; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 95a57d1bdf3318..aca50c461a7a19 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -42,7 +42,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0]
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -56,7 +56,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [18446744073709551615,0]
; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
@@ -67,7 +67,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
-; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
+; VL_BW_DQ-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0]
; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index f8e3a7a23056fb..3294c7ffee40d2 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -516,7 +516,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX1-LABEL: trunc8i32_8i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -527,7 +527,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX2-LABEL: trunc8i32_8i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
index 3d5947d8e59bd4..3dc43031cea9ef 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -1762,37 +1762,37 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv2i64:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv2i64:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv2i64:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; BITALG-NEXT: retq
;
; X86-SSE-LABEL: foldv2i64:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0]
; X86-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
ret <2 x i64> %out
@@ -1801,37 +1801,37 @@ define <2 x i64> @foldv2i64() nounwind {
define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64u:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv2i64u:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv2i64u:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv2i64u:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv2i64u:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; BITALG-NEXT: retq
;
; X86-SSE-LABEL: foldv2i64u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0]
; X86-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
ret <2 x i64> %out
diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll
index 0630a40b88099e..050b3329a4abb6 100644
--- a/llvm/test/CodeGen/X86/vselect-constants.ll
+++ b/llvm/test/CodeGen/X86/vselect-constants.ll
@@ -281,7 +281,7 @@ define i32 @wrong_min_signbits(<2 x i16> %x) {
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pcmpeqw %xmm0, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm0 = [1,0,0,0]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: por %xmm0, %xmm1
@@ -292,7 +292,7 @@ define i32 @wrong_min_signbits(<2 x i16> %x) {
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0]
; AVX-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect-post-combine.ll b/llvm/test/CodeGen/X86/vselect-post-combine.ll
index e91b8d029bcb4a..474f70f78937eb 100644
--- a/llvm/test/CodeGen/X86/vselect-post-combine.ll
+++ b/llvm/test/CodeGen/X86/vselect-post-combine.ll
@@ -4,7 +4,7 @@
define ptr @test_mul(ptr %addr) {
; AVX2-LABEL: test_mul:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0]
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vpblendvb %xmm0, (%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll
index 0f121d88b3573d..18cb5e1b86ec31 100644
--- a/llvm/test/CodeGen/X86/widen_bitcnt.ll
+++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll
@@ -345,7 +345,7 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32
define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
; SSE42-LABEL: widen_ctlz_v2i32_v4i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm3, %xmm6
; SSE42-NEXT: pshufb %xmm0, %xmm6
; SSE42-NEXT: movdqa %xmm0, %xmm5
@@ -394,7 +394,7 @@ define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
;
; AVX2-LABEL: widen_ctlz_v2i32_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -448,7 +448,7 @@ define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
define <8 x i32> @widen_ctlz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) {
; SSE42-LABEL: widen_ctlz_v4i32_v8i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm3, %xmm6
; SSE42-NEXT: pshufb %xmm0, %xmm6
; SSE42-NEXT: movdqa %xmm0, %xmm5
@@ -535,7 +535,7 @@ define <8 x i32> @widen_ctlz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) {
define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) {
; SSE42-LABEL: widen_ctlz_v2i32_v8i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm5, %xmm8
; SSE42-NEXT: pshufb %xmm0, %xmm8
; SSE42-NEXT: movdqa %xmm0, %xmm7
@@ -629,7 +629,7 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm4
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm6
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -724,7 +724,7 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>
define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
; SSE42-LABEL: widen_ctlz_undef_v2i32_v4i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm3, %xmm6
; SSE42-NEXT: pshufb %xmm0, %xmm6
; SSE42-NEXT: movdqa %xmm0, %xmm5
@@ -773,7 +773,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
;
; AVX2-LABEL: widen_ctlz_undef_v2i32_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -827,7 +827,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) {
; SSE42-LABEL: widen_ctlz_undef_v4i32_v8i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm3, %xmm6
; SSE42-NEXT: pshufb %xmm0, %xmm6
; SSE42-NEXT: movdqa %xmm0, %xmm5
@@ -914,7 +914,7 @@ define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) {
define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) {
; SSE42-LABEL: widen_ctlz_undef_v2i32_v8i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm5, %xmm8
; SSE42-NEXT: pshufb %xmm0, %xmm8
; SSE42-NEXT: movdqa %xmm0, %xmm7
@@ -1008,7 +1008,7 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm4
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm6
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 742a3a0c66a2ac..06140b2395fca3 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -444,7 +444,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
@@ -453,7 +453,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
@@ -463,7 +463,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
@@ -472,7 +472,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -492,7 +492,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
@@ -501,7 +501,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
@@ -511,7 +511,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
@@ -520,7 +520,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -571,7 +571,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm4
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm8 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm5
; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm7
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
@@ -591,7 +591,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm8
; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm12
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-NEXT: vmovd {{.*#+}} xmm12 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -609,7 +609,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -625,7 +625,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index 023ac96181d3cc..8c9dc90d2a71da 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -1686,7 +1686,7 @@ define void @vec256_v32i8_to_v2i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -1783,7 +1783,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1794,7 +1794,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1805,7 +1805,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -1997,7 +1997,7 @@ define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -2126,7 +2126,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2137,7 +2137,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2148,7 +2148,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3360,7 +3360,7 @@ define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
@@ -3483,7 +3483,7 @@ define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -3609,7 +3609,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -3622,7 +3622,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
@@ -3635,7 +3635,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -4261,7 +4261,7 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
@@ -4442,7 +4442,7 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -4589,7 +4589,7 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -4602,7 +4602,7 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
@@ -5998,7 +5998,7 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
@@ -6182,7 +6182,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -6217,7 +6217,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -6231,7 +6231,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -6245,7 +6245,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -6297,7 +6297,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6310,7 +6310,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
@@ -6323,7 +6323,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -6573,7 +6573,7 @@ define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
@@ -6755,7 +6755,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -6809,7 +6809,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -6823,7 +6823,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -6837,7 +6837,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512BW-NEXT: vpand %ymm0, %ymm1, %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -6908,7 +6908,7 @@ define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6921,7 +6921,7 @@ define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index b4c79aff5cef1f..8ab53140eb9110 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: paddb 48(%rsi), %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 32(%rsi), %xmm1
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX-NEXT: # xmm3 = mem[0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -6055,7 +6054,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6068,7 +6067,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6081,7 +6080,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6520,7 +6519,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6533,7 +6532,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6546,7 +6545,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 23f02e9245eeda..c362bdaa3217d0 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 32(%rdi), %xmm1
; SSE42-NEXT: movdqa 48(%rdi), %xmm2
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX-NEXT: # xmm3 = mem[0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -4884,7 +4883,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
;
; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0]
; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -4895,7 +4894,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
;
; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0]
; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -4906,7 +4905,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
;
; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0]
; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -5291,7 +5290,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
;
; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [65535,0,0,0]
; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -5302,7 +5301,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
;
; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = [65535,0,0,0]
; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -5313,7 +5312,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
;
; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = [65535,0,0,0]
; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
More information about the llvm-commits
mailing list