[llvm] [X86] X86FixupVectorConstants - shrink vector load to movsd/movss/movd/movq 'zero upper' instructions (PR #79000)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 23 10:34:09 PST 2024
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/79000
>From 63343ed2e64b7b3c6a38ba7f079284da55bf2240 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 22 Jan 2024 16:03:30 +0000
Subject: [PATCH] [X86] X86FixupVectorConstants - shrink vector load to
movsd/movss/movd/movq 'zero upper' instructions
If we're loading a vector constant that is known to be zero in the upper elements, attempt to shrink the constant and use a scalar load of just the lower 32/64 bits.
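As an illustrative sketch (in the spirit of the test diffs below; the constant value is made up), a full 16-byte constant pool load whose value only occupies the low 32 bits can be replaced with a 4-byte movd/movss load, which implicitly zeroes the upper lanes of the destination register:

  movaps {{.*#+}} xmm0 = [42,0,0,0]   <-- before: 16-byte load
  movd {{.*#+}} xmm0 = [42,0,0,0]     <-- after: 4-byte load, upper lanes zeroed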
Prefer this over a broadcast load (even if the broadcast would have used a smaller constant), as scalar loads can sometimes be performed on more ports than broadcast loads.
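The bitreverse.ll diff below shows this in practice: a repeated 32-bit pattern that previously used a vbroadcastss load now loads only the (zero-padded) low element with vmovss:

  -vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
  +vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]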
---
.../Target/X86/X86FixupVectorConstants.cpp | 95 +-
.../test/CodeGen/X86/2011-20-21-zext-ui2fp.ll | 2 +-
.../any_extend_vector_inreg_of_broadcast.ll | 5 +-
...d_vector_inreg_of_broadcast_from_memory.ll | 5 +-
llvm/test/CodeGen/X86/avx-load-store.ll | 2 +-
llvm/test/CodeGen/X86/avx2-arith.ll | 2 +-
.../avx512-shuffles/shuffle-chained-bf16.ll | 2 +-
llvm/test/CodeGen/X86/bitreverse.ll | 6 +-
llvm/test/CodeGen/X86/combine-srl.ll | 4 +-
llvm/test/CodeGen/X86/combine-subo.ll | 4 +-
.../test/CodeGen/X86/constant-pool-sharing.ll | 4 +-
llvm/test/CodeGen/X86/dpbusd.ll | 2 +-
llvm/test/CodeGen/X86/dpbusd_const.ll | 6 +-
.../CodeGen/X86/expand-vp-cast-intrinsics.ll | 2 +-
llvm/test/CodeGen/X86/fcmp-constant.ll | 2 +-
.../test/CodeGen/X86/fold-vector-sext-zext.ll | 24 +-
llvm/test/CodeGen/X86/half.ll | 4 +-
.../X86/insert-into-constant-vector.ll | 8 +-
llvm/test/CodeGen/X86/insertelement-ones.ll | 4 +-
.../X86/masked_gather_scatter_widen.ll | 2 +-
llvm/test/CodeGen/X86/masked_store_trunc.ll | 10 +-
.../CodeGen/X86/memcmp-more-load-pairs.ll | 2 +-
llvm/test/CodeGen/X86/memcmp.ll | 2 +-
llvm/test/CodeGen/X86/pmaddubsw.ll | 7 +-
llvm/test/CodeGen/X86/pr46532.ll | 2 +-
llvm/test/CodeGen/X86/pr63108.ll | 16 +-
llvm/test/CodeGen/X86/pr74736.ll | 2 +-
llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll | 2 +-
llvm/test/CodeGen/X86/pshufb-mask-comments.ll | 2 +-
llvm/test/CodeGen/X86/ret-mmx.ll | 2 +-
llvm/test/CodeGen/X86/sad.ll | 2 +-
llvm/test/CodeGen/X86/sext-vsetcc.ll | 4 +-
llvm/test/CodeGen/X86/shrink_vmul.ll | 4 +-
llvm/test/CodeGen/X86/shuffle-half.ll | 2 +-
.../X86/shuffle-strided-with-offset-256.ll | 348 ++--
llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 136 +-
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 4 +-
llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll | 56 +-
llvm/test/CodeGen/X86/vec_anyext.ll | 10 +-
llvm/test/CodeGen/X86/vec_fp_to_int.ll | 10 +-
llvm/test/CodeGen/X86/vec_set-A.ll | 4 +-
llvm/test/CodeGen/X86/vector-blend.ll | 18 +-
.../X86/vector-constrained-fp-intrinsics.ll | 2 +-
llvm/test/CodeGen/X86/vector-fshl-128.ll | 16 +-
llvm/test/CodeGen/X86/vector-fshl-256.ll | 6 +-
llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 12 +-
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 2 +-
.../CodeGen/X86/vector-fshl-rot-sub128.ll | 6 +-
llvm/test/CodeGen/X86/vector-fshl-sub128.ll | 2 +-
llvm/test/CodeGen/X86/vector-fshr-128.ll | 16 +-
llvm/test/CodeGen/X86/vector-fshr-256.ll | 4 +-
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 12 +-
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 2 +-
.../CodeGen/X86/vector-fshr-rot-sub128.ll | 6 +-
llvm/test/CodeGen/X86/vector-fshr-sub128.ll | 2 +-
.../CodeGen/X86/vector-half-conversions.ll | 4 +-
.../vector-interleaved-load-i16-stride-3.ll | 16 +-
.../vector-interleaved-load-i16-stride-5.ll | 14 +-
.../vector-interleaved-load-i16-stride-6.ll | 14 +-
.../vector-interleaved-load-i16-stride-7.ll | 42 +-
.../vector-interleaved-load-i16-stride-8.ll | 50 +-
.../vector-interleaved-load-i32-stride-3.ll | 2 +-
.../vector-interleaved-load-i32-stride-4.ll | 8 +-
.../vector-interleaved-load-i32-stride-6.ll | 24 +-
.../vector-interleaved-load-i32-stride-7.ll | 45 +-
.../vector-interleaved-load-i32-stride-8.ll | 2 +-
.../vector-interleaved-load-i8-stride-2.ll | 13 +-
.../vector-interleaved-load-i8-stride-4.ll | 156 +-
.../vector-interleaved-load-i8-stride-5.ll | 82 +-
.../vector-interleaved-load-i8-stride-6.ll | 107 +-
.../vector-interleaved-load-i8-stride-7.ll | 375 ++--
.../vector-interleaved-load-i8-stride-8.ll | 1569 ++++++++---------
.../vector-interleaved-store-i8-stride-5.ll | 6 +-
.../vector-interleaved-store-i8-stride-7.ll | 3 +-
.../vector-interleaved-store-i8-stride-8.ll | 30 +-
llvm/test/CodeGen/X86/vector-lzcnt-128.ll | 112 +-
llvm/test/CodeGen/X86/vector-lzcnt-256.ll | 16 +-
.../CodeGen/X86/vector-mulfix-legalize.ll | 4 +-
.../CodeGen/X86/vector-reduce-add-mask.ll | 4 +-
llvm/test/CodeGen/X86/vector-rotate-128.ll | 14 +-
llvm/test/CodeGen/X86/vector-rotate-256.ll | 2 +-
.../CodeGen/X86/vector-shift-ashr-sub128.ll | 6 +-
.../CodeGen/X86/vector-shift-lshr-sub128.ll | 6 +-
.../CodeGen/X86/vector-shift-shl-sub128.ll | 4 +-
.../CodeGen/X86/vector-shuffle-128-v16.ll | 11 +-
.../CodeGen/X86/vector-shuffle-256-v16.ll | 8 +-
.../CodeGen/X86/vector-shuffle-256-v32.ll | 36 +-
.../test/CodeGen/X86/vector-shuffle-256-v8.ll | 8 +-
.../CodeGen/X86/vector-shuffle-512-v32.ll | 4 +-
.../CodeGen/X86/vector-shuffle-512-v64.ll | 8 +-
.../test/CodeGen/X86/vector-shuffle-512-v8.ll | 4 +-
.../CodeGen/X86/vector-shuffle-combining.ll | 56 +-
llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 6 +-
llvm/test/CodeGen/X86/vector-trunc.ll | 4 +-
llvm/test/CodeGen/X86/vector-tzcnt-128.ll | 28 +-
llvm/test/CodeGen/X86/vselect-constants.ll | 4 +-
llvm/test/CodeGen/X86/vselect-post-combine.ll | 2 +-
llvm/test/CodeGen/X86/widen_bitcnt.ll | 20 +-
.../CodeGen/X86/x86-interleaved-access.ll | 56 +-
.../CodeGen/X86/zero_extend_vector_inreg.ll | 64 +-
.../zero_extend_vector_inreg_of_broadcast.ll | 17 +-
...d_vector_inreg_of_broadcast_from_memory.ll | 17 +-
102 files changed, 1735 insertions(+), 2266 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 9da8d7338ea36c5..75ce3c39fbd3187 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -67,6 +67,9 @@ FunctionPass *llvm::createX86FixupVectorConstants() {
static std::optional<APInt> extractConstantBits(const Constant *C) {
  unsigned NumBits = C->getType()->getPrimitiveSizeInBits();

+  if (isa<UndefValue>(C))
+    return APInt::getZero(NumBits);
+
  if (auto *CInt = dyn_cast<ConstantInt>(C))
    return CInt->getValue();
@@ -80,6 +83,18 @@ static std::optional<APInt> extractConstantBits(const Constant *C) {
        return APInt::getSplat(NumBits, *Bits);
      }
    }
+
+    APInt Bits = APInt::getZero(NumBits);
+    for (unsigned I = 0, E = CV->getNumOperands(); I != E; ++I) {
+      Constant *Elt = CV->getOperand(I);
+      std::optional<APInt> SubBits = extractConstantBits(Elt);
+      if (!SubBits)
+        return std::nullopt;
+      assert(NumBits == (E * SubBits->getBitWidth()) &&
+             "Illegal vector element size");
+      Bits.insertBits(*SubBits, I * SubBits->getBitWidth());
+    }
+    return Bits;
  }

  if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
@@ -223,6 +238,33 @@ static Constant *rebuildSplatableConstant(const Constant *C,
  return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits);
}

+static Constant *rebuildZeroUpperConstant(const Constant *C,
+                                          unsigned ScalarBitWidth) {
+  Type *Ty = C->getType();
+  Type *SclTy = Ty->getScalarType();
+  unsigned NumBits = Ty->getPrimitiveSizeInBits();
+  unsigned NumSclBits = SclTy->getPrimitiveSizeInBits();
+  LLVMContext &Ctx = C->getContext();
+
+  if (NumBits > ScalarBitWidth) {
+    // Determine if the upper bits are all zero.
+    if (std::optional<APInt> Bits = extractConstantBits(C)) {
+      if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) {
+        // If the original constant was made of smaller elements, try to retain
+        // those types.
+        if (ScalarBitWidth > NumSclBits && (ScalarBitWidth % NumSclBits) == 0)
+          return rebuildConstant(Ctx, SclTy, *Bits, NumSclBits);
+
+        // Fallback to raw integer bits.
+        APInt RawBits = Bits->zextOrTrunc(ScalarBitWidth);
+        return ConstantInt::get(Ctx, RawBits);
+      }
+    }
+  }
+
+  return nullptr;
+}
+
bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                                                     MachineBasicBlock &MBB,
                                                     MachineInstr &MI) {
@@ -263,6 +305,34 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
    return false;
  };

+  auto ConvertToZeroUpper = [&](unsigned OpUpper64, unsigned OpUpper32,
+                                unsigned OperandNo) {
+    assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
+           "Unexpected number of operands!");
+
+    if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
+      // Attempt to detect a suitable 'zero upper' constant, smallest width first.
+      std::pair<unsigned, unsigned> ZeroUppers[] = {
+          {32, OpUpper32},
+          {64, OpUpper64},
+      };
+      for (auto [BitWidth, OpUpper] : ZeroUppers) {
+        if (OpUpper) {
+          // Construct a suitable 'zero upper' constant and adjust the MI to
+          // use the new constant pool entry.
+          if (Constant *NewCst = rebuildZeroUpperConstant(C, BitWidth)) {
+            unsigned NewCPI =
+                CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
+            MI.setDesc(TII->get(OpUpper));
+            MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  };
+
  // Attempt to convert full width vector loads into broadcast loads.
  switch (Opc) {
  /* FP Loads */
@@ -271,12 +341,13 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
  case X86::MOVUPDrm:
  case X86::MOVUPSrm:
    // TODO: SSE3 MOVDDUP Handling
-    return false;
+    return ConvertToZeroUpper(X86::MOVSDrm, X86::MOVSSrm, 1);
  case X86::VMOVAPDrm:
  case X86::VMOVAPSrm:
  case X86::VMOVUPDrm:
  case X86::VMOVUPSrm:
-    return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
+    return ConvertToZeroUpper(X86::VMOVSDrm, X86::VMOVSSrm, 1) ||
+           ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
                              1);
  case X86::VMOVAPDYrm:
  case X86::VMOVAPSYrm:
@@ -288,7 +359,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
  case X86::VMOVAPSZ128rm:
  case X86::VMOVUPDZ128rm:
  case X86::VMOVUPSZ128rm:
-    return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
+    return ConvertToZeroUpper(X86::VMOVSDZrm, X86::VMOVSSZrm, 1) ||
+           ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
                              X86::VBROADCASTSSZ128rm, 0, 0, 1);
  case X86::VMOVAPDZ256rm:
  case X86::VMOVAPSZ256rm:
@@ -305,13 +377,17 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                              X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0,
                              1);
  /* Integer Loads */
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+    return ConvertToZeroUpper(X86::MOVQI2PQIrm, X86::MOVDI2PDIrm, 1);
  case X86::VMOVDQArm:
  case X86::VMOVDQUrm:
-    return ConvertToBroadcast(
-        0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
-        HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
-        HasAVX2 ? X86::VPBROADCASTWrm : 0, HasAVX2 ? X86::VPBROADCASTBrm : 0,
-        1);
+    return ConvertToZeroUpper(X86::VMOVQI2PQIrm, X86::VMOVDI2PDIrm, 1) ||
+           ConvertToBroadcast(
+               0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
+               HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
+               HasAVX2 ? X86::VPBROADCASTWrm : 0,
+               HasAVX2 ? X86::VPBROADCASTBrm : 0, 1);
  case X86::VMOVDQAYrm:
  case X86::VMOVDQUYrm:
    return ConvertToBroadcast(
@@ -324,7 +400,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
  case X86::VMOVDQA64Z128rm:
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQU64Z128rm:
-    return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm,
+    return ConvertToZeroUpper(X86::VMOVQI2PQIZrm, X86::VMOVDI2PDIZrm, 1) ||
+           ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm,
                              X86::VPBROADCASTDZ128rm,
                              HasBWI ? X86::VPBROADCASTWZ128rm : 0,
                              HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1);
diff --git a/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll b/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
index e4e5d51d272d618..1ae51ee97563885 100644
--- a/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
+++ b/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
@@ -7,7 +7,7 @@
define void @ui_to_fp_conv(ptr nocapture %aFOO, ptr nocapture %RET) nounwind {
; CHECK-LABEL: ui_to_fp_conv:
; CHECK: # %bb.0: # %allocas
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: movups %xmm1, 16(%rsi)
; CHECK-NEXT: movups %xmm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index fe48059f9d0e653..e592b714a05dc81 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: paddb 48(%rsi), %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 32(%rsi), %xmm1
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX-NEXT: # xmm3 = mem[0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 2f576fe67159046..d3f6bd20a0127ca 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 32(%rdi), %xmm1
; SSE42-NEXT: movdqa 48(%rdi), %xmm2
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX-NEXT: # xmm3 = mem[0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll
index 33eb704788740ab..3f856d33145d86b 100644
--- a/llvm/test/CodeGen/X86/avx-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx-load-store.ll
@@ -220,7 +220,7 @@ define void @f_f() nounwind {
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB9_4
; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check
;
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 3e6958117194485..d32143cf33f2fa8 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -234,7 +234,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) {
define <8 x i32> @mul_const9(<8 x i32> %x) {
; CHECK-LABEL: mul_const9:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0]
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll
index 99d6049fc1d8658..12ce721b8c5d534 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll
@@ -13,7 +13,7 @@ define <2 x bfloat> @shuffle_chained_v32bf16_v2bf16(<32 x bfloat> %a) {
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
-; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,16,0,16,0,16,0,16]
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [0,16,0,0,0,0,0,0]
; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp)
; CHECK-NEXT: vmovaps (%rsp), %xmm0
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 2d978b5e991c913..26b1d64874e5905 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -587,17 +587,17 @@ define <2 x i16> @fold_v2i16() {
;
; X64-LABEL: fold_v2i16:
; X64: # %bb.0:
-; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240,u,u,u,u,u,u]
+; X64-NEXT: movss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; X64-NEXT: retq
;
; X86XOP-LABEL: fold_v2i16:
; X86XOP: # %bb.0:
-; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
+; X86XOP-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; X86XOP-NEXT: retl
;
; GFNI-LABEL: fold_v2i16:
; GFNI: # %bb.0:
-; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
+; GFNI-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; GFNI-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
ret <2 x i16> %b
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 3e0f581ea01a0a4..125196a0819b61e 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -356,7 +356,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE: # %bb.0:
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pshufb %xmm0, %xmm2
; SSE-NEXT: psrlw $4, %xmm0
@@ -378,7 +378,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/combine-subo.ll b/llvm/test/CodeGen/X86/combine-subo.ll
index 235df0a666ee9f8..5e4bba6e0fd35c3 100644
--- a/llvm/test/CodeGen/X86/combine-subo.ll
+++ b/llvm/test/CodeGen/X86/combine-subo.ll
@@ -217,13 +217,13 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind {
define { <4 x i8>, <4 x i1> } @never_usub_const_vector() nounwind {
; SSE-LABEL: never_usub_const_vector:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: never_usub_const_vector:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,255,0,254,127,255,0,254,127,255,0,254,127,255,0,254]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: retq
%x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> <i8 255, i8 255, i8 255, i8 255>, <4 x i8> <i8 128, i8 0, i8 255, i8 1>)
diff --git a/llvm/test/CodeGen/X86/constant-pool-sharing.ll b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
index db5a7810974c9da..062d87ed035fd67 100644
--- a/llvm/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
@@ -77,7 +77,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
; SSE-LINUX: # %bb.0:
; SSE-LINUX-NEXT: xorps %xmm0, %xmm0
; SSE-LINUX-NEXT: movaps %xmm0, 48(%rdi)
-; SSE-LINUX-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551615,0]
+; SSE-LINUX-NEXT: movsd {{.*#+}} xmm1 = [18446744073709551615,0]
; SSE-LINUX-NEXT: movaps %xmm1, 32(%rdi)
; SSE-LINUX-NEXT: movaps %xmm1, 16(%rdi)
; SSE-LINUX-NEXT: movaps %xmm1, (%rdi)
@@ -92,7 +92,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
; SSE-MSVC: # %bb.0:
; SSE-MSVC-NEXT: xorps %xmm0, %xmm0
; SSE-MSVC-NEXT: movaps %xmm0, 48(%rcx)
-; SSE-MSVC-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551615,0]
+; SSE-MSVC-NEXT: movsd {{.*#+}} xmm1 = [18446744073709551615,0]
; SSE-MSVC-NEXT: movaps %xmm1, 32(%rcx)
; SSE-MSVC-NEXT: movaps %xmm1, 16(%rcx)
; SSE-MSVC-NEXT: movaps %xmm1, (%rcx)
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll
index 06f11e6d527bded..fbea08eb1e5502a 100644
--- a/llvm/test/CodeGen/X86/dpbusd.ll
+++ b/llvm/test/CodeGen/X86/dpbusd.ll
@@ -379,7 +379,7 @@ define i32 @vpdpbusd_2xi32(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVX512VNNI-LABEL: vpdpbusd_2xi32:
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512VNNI-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT: vpandq %zmm1, %zmm2, %zmm1
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index 2fe9feb06dff67c..5862e614265b1f4 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -108,7 +108,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVXVNNI: # %bb.0: # %entry
; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVXVNNI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1
; AVXVNNI-NEXT: vmovd %xmm1, %eax
; AVXVNNI-NEXT: addl %edi, %eax
@@ -118,7 +118,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT: vmovd %xmm2, %eax
@@ -130,7 +130,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VLVNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2
; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
index 3b015acb69bd2ee..0a52dfff71eda41 100644
--- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -532,7 +532,7 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0]
; AVX512-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: addq $40, %rsp
; AVX512-NEXT: .cfi_def_cfa_offset 8
diff --git a/llvm/test/CodeGen/X86/fcmp-constant.ll b/llvm/test/CodeGen/X86/fcmp-constant.ll
index 481a32b39dd377e..335cb28213f9290 100644
--- a/llvm/test/CodeGen/X86/fcmp-constant.ll
+++ b/llvm/test/CodeGen/X86/fcmp-constant.ll
@@ -92,7 +92,7 @@ define <2 x i64> @fcmp_ueq_v2f64_undef() {
define <2 x i64> @fcmp_ueq_v2f64_undef_elt() {
; CHECK-LABEL: fcmp_ueq_v2f64_undef_elt:
; CHECK: # %bb.0:
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,0]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [18446744073709551615,0]
; CHECK-NEXT: retq
%1 = fcmp ueq <2 x double> <double 0x3FF0000000000000, double 0xFFEFFFFFFFFFFFFF>, <double undef, double 0x3FF0000000000000>
%2 = sext <2 x i1> %1 to <2 x i64>
diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
index 3f8bd24c380492e..d31168f4078901a 100644
--- a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
+++ b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -11,14 +11,12 @@
define <4 x i16> @test_sext_4i8_4i16() {
; X32-LABEL: test_sext_4i8_4i16:
; X32: # %bb.0:
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533]
-; X32-NEXT: # xmm0 = mem[0,0]
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,2,65533,0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16:
; X64: # %bb.0:
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533]
-; X64-NEXT: # xmm0 = mem[0,0]
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,2,65533,0,0,0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
@@ -31,14 +29,12 @@ define <4 x i16> @test_sext_4i8_4i16() {
define <4 x i16> @test_sext_4i8_4i16_undef() {
; X32-LABEL: test_sext_4i8_4i16_undef:
; X32: # %bb.0:
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533]
-; X32-NEXT: # xmm0 = mem[0,0]
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,0,65533,0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16_undef:
; X64: # %bb.0:
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533]
-; X64-NEXT: # xmm0 = mem[0,0]
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,0,65533,0,0,0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
@@ -211,14 +207,12 @@ define <8 x i32> @test_sext_8i8_8i32_undef() {
define <4 x i16> @test_zext_4i8_4i16() {
; X32-LABEL: test_zext_4i8_4i16:
; X32: # %bb.0:
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253]
-; X32-NEXT: # xmm0 = mem[0,0]
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,2,253,0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16:
; X64: # %bb.0:
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253]
-; X64-NEXT: # xmm0 = mem[0,0]
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,2,253,0,0,0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
@@ -267,14 +261,12 @@ define <4 x i64> @test_zext_4i8_4i64() {
define <4 x i16> @test_zext_4i8_4i16_undef() {
; X32-LABEL: test_zext_4i8_4i16_undef:
; X32: # %bb.0:
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253]
-; X32-NEXT: # xmm0 = mem[0,0]
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,0,253,0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16_undef:
; X64: # %bb.0:
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253]
-; X64-NEXT: # xmm0 = mem[0,0]
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,0,253,0,0,0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index b42f6fdea34b65f..d0853fdc748d293 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -2116,7 +2116,7 @@ define void @pr63114() {
; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm0
-; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
+; CHECK-LIBCALL-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
; CHECK-LIBCALL-NEXT: por %xmm2, %xmm0
; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm0
@@ -2181,7 +2181,7 @@ define void @pr63114() {
; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
; CHECK-I686-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
; CHECK-I686-NEXT: pand %xmm1, %xmm0
-; CHECK-I686-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
+; CHECK-I686-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
; CHECK-I686-NEXT: por %xmm2, %xmm0
; CHECK-I686-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
; CHECK-I686-NEXT: pand %xmm3, %xmm0
diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index 418d632480328fb..bc977e006606e8e 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -375,7 +375,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-SSE-LABEL: elt5_v8i64:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [4,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = [4,0,0,0]
; X86-SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [42,0,1,0]
; X86-SSE-NEXT: movaps {{.*#+}} xmm1 = [2,0,3,0]
@@ -404,7 +404,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-AVX1-LABEL: elt5_v8i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0]
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = [4,0,0,0]
; X86-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-AVX1-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
@@ -421,7 +421,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-AVX2-LABEL: elt5_v8i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0]
+; X86-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [4,0,0,0]
; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-AVX2-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
@@ -439,7 +439,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512F-NEXT: vmovaps {{.*#+}} xmm2 = [4,0,0,0]
+; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [4,0,0,0]
; X86-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X86-AVX512F-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
index ed7cbb0a8430d93..cfcb8798e0b9cc6 100644
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -280,7 +280,7 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
;
; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0]
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -385,7 +385,7 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
;
; AVX1-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0]
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
index f63545bcd178e5f..aad1b443448503b 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
@@ -739,7 +739,7 @@ define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index)
; WIDEN_AVX2-NEXT: vgatherdps %ymm5, (%rsi,%ymm1,4), %ymm6
; WIDEN_AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; WIDEN_AVX2-NEXT: vgatherdps %ymm3, (%rsi,%ymm0,4), %ymm1
-; WIDEN_AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
+; WIDEN_AVX2-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
; WIDEN_AVX2-NEXT: vgatherdps %ymm0, (%rsi,%ymm2,4), %ymm4
; WIDEN_AVX2-NEXT: vmovss %xmm4, 64(%rdi)
; WIDEN_AVX2-NEXT: vmovaps %ymm1, 32(%rdi)
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index f78646afccb3453..8fc157c2edae2a9 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -1395,7 +1395,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE4-LABEL: truncstore_v4i64_v4i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE4-NEXT: movd {{.*#+}} xmm4 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE4-NEXT: pshufb %xmm4, %xmm1
; SSE4-NEXT: pshufb %xmm4, %xmm0
; SSE4-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1435,7 +1435,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -1477,7 +1477,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -3791,7 +3791,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1-LABEL: truncstore_v8i32_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
@@ -3865,7 +3865,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index 6eb02bfc1fd0c39..da46ea406557917 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -947,7 +947,7 @@ define i1 @length24_eq_const(ptr %X) nounwind {
; X64-MIC-AVX: # %bb.0:
; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
+; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm2 = [959985462,858927408,0,0]
; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0
; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
index f5e7384362a92b1..83cb0d6f973be5b 100644
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -1015,7 +1015,7 @@ define i1 @length24_eq_const(ptr %X) nounwind {
; X64-MIC-AVX: # %bb.0:
; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
+; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm2 = [959985462,858927408,0,0]
; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0
; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll
index ea0b4e4b21c775d..e46a14673a5171f 100644
--- a/llvm/test/CodeGen/X86/pmaddubsw.ll
+++ b/llvm/test/CodeGen/X86/pmaddubsw.ll
@@ -320,8 +320,7 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
-; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
@@ -349,9 +348,9 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) {
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %xmm0
; AVX256-NEXT: vmovdqa (%rsi), %xmm1
-; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
+; AVX256-NEXT: vmovq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX256-NEXT: vmovq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll
index ffaa131d5cbb1b0..cbc677229ede61e 100644
--- a/llvm/test/CodeGen/X86/pr46532.ll
+++ b/llvm/test/CodeGen/X86/pr46532.ll
@@ -7,7 +7,7 @@ define void @WhileWithLoopInvariantOperation.21() {
; CHECK-NEXT: movq (%rax), %rax
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, 32(%rax)
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
while.1.body.preheader:
%0 = load ptr, ptr undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2
diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll
index 38e45595a8846e4..a0b6d933491f1be 100644
--- a/llvm/test/CodeGen/X86/pr63108.ll
+++ b/llvm/test/CodeGen/X86/pr63108.ll
@@ -11,11 +11,11 @@ define i32 @PR63108() {
; SSE-NEXT: testb %al, %al
; SSE-NEXT: je .LBB0_2
; SSE-NEXT: # %bb.1:
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [251,223,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE-NEXT: movd {{.*#+}} xmm0 = [57339,0,0,0]
; SSE-NEXT: jmp .LBB0_5
; SSE-NEXT: .LBB0_2: # %vector.body.preheader
; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [57339,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [57339,0,0,0]
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: .p2align 4, 0x90
; SSE-NEXT: .LBB0_3: # %vector.body
@@ -47,10 +47,10 @@ define i32 @PR63108() {
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: je .LBB0_2
; AVX1-NEXT: # %bb.1:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [251,223,0,0,251,223,0,0,251,223,0,0,251,223,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
; AVX1-NEXT: jmp .LBB0_5
; AVX1-NEXT: .LBB0_2: # %vector.body.preheader
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [57339,0,0,0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [57339,0,0,0]
; AVX1-NEXT: xorl %eax, %eax
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB0_3: # %vector.body
@@ -84,10 +84,10 @@ define i32 @PR63108() {
; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: je .LBB0_2
; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223]
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
; AVX2-NEXT: jmp .LBB0_5
; AVX2-NEXT: .LBB0_2: # %vector.body.preheader
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [57339,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB0_3: # %vector.body
@@ -121,10 +121,10 @@ define i32 @PR63108() {
; AVX512-NEXT: testb %al, %al
; AVX512-NEXT: je .LBB0_2
; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223]
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
; AVX512-NEXT: jmp .LBB0_5
; AVX512-NEXT: .LBB0_2: # %vector.body.preheader
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [57339,0,0,0]
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB0_3: # %vector.body
diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll
index 3dfdbf102c953e3..1c3b4bd4971c11e 100644
--- a/llvm/test/CodeGen/X86/pr74736.ll
+++ b/llvm/test/CodeGen/X86/pr74736.ll
@@ -6,7 +6,7 @@ define void @main(<16 x i32> %0, i32 %1) {
; SSE-LABEL: main:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movd %edi, %xmm4
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,0]
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: paddd %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
index bbe46a99ffa414d..5f13e974874351a 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
@@ -38,7 +38,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
define <16 x i8> @testv16i8(<16 x i8> %in) {
; AVX256-LABEL: testv16i8:
; AVX256: # %bb.0:
-; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX256-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll
index ef91e7a3f910753..b96338984d6f551 100644
--- a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -54,7 +54,7 @@ define <16 x i8> @test4(<16 x i8> %V, ptr %P) {
define <16 x i8> @test5(<16 x i8> %V) {
; CHECK-LABEL: test5:
; CHECK: # %bb.0:
-; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,0,0,0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1,0,0,0]
; CHECK-NEXT: movaps %xmm1, (%rax)
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,1]
; CHECK-NEXT: movaps %xmm1, (%rax)
diff --git a/llvm/test/CodeGen/X86/ret-mmx.ll b/llvm/test/CodeGen/X86/ret-mmx.ll
index 815f95e64496b06..81dd73363c1fb1f 100644
--- a/llvm/test/CodeGen/X86/ret-mmx.ll
+++ b/llvm/test/CodeGen/X86/ret-mmx.ll
@@ -32,7 +32,7 @@ define <1 x i64> @t2() nounwind {
define <2 x i32> @t3() nounwind {
; CHECK-LABEL: t3:
; CHECK: ## %bb.0:
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0]
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0]
; CHECK-NEXT: retq
ret <2 x i32> <i32 1, i32 0>
}
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 3d3e935045475e5..d043234705aa06c 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -544,7 +544,7 @@ define dso_local i32 @sad_2i8() nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB3_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll
index 839c972b23dbab3..de9d47526b166bf 100644
--- a/llvm/test/CodeGen/X86/sext-vsetcc.ll
+++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll
@@ -216,7 +216,7 @@ define <4 x i32> @cmp_ult_load_const(ptr %x) nounwind {
; SSE-LABEL: cmp_ult_load_const:
; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [42,214,0,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE-NEXT: pmaxub %xmm0, %xmm1
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -243,7 +243,7 @@ define <3 x i32> @cmp_ult_load_const_bad_type(ptr %x) nounwind {
; SSE-LABEL: cmp_ult_load_const_bad_type:
; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [42,214,0,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [42,214,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE-NEXT: pmaxub %xmm0, %xmm1
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 8dd06e0d848d328..2610f4322c8e2b1 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1745,7 +1745,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,u,u,u,u,u,u]
+; X86-SSE-NEXT: movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
@@ -1768,7 +1768,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,u,u,u,u,u,u]
+; X64-SSE-NEXT: movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/shuffle-half.ll b/llvm/test/CodeGen/X86/shuffle-half.ll
index 0d27fc389676689..291fe841043ed42 100644
--- a/llvm/test/CodeGen/X86/shuffle-half.ll
+++ b/llvm/test/CodeGen/X86/shuffle-half.ll
@@ -10,7 +10,7 @@ define <32 x half> @dump_vec() {
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.1: # %cond.load
; CHECK-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
index 42f1bd7824909bf..9f21e3cdfd350bd 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
@@ -12,34 +12,22 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -50,7 +38,7 @@ define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -147,27 +135,16 @@ define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F: # %bb.0:
@@ -207,27 +184,16 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F: # %bb.0:
@@ -267,27 +233,16 @@ define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F: # %bb.0:
@@ -534,27 +489,16 @@ define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F: # %bb.0:
@@ -594,27 +538,16 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F: # %bb.0:
@@ -654,27 +587,16 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F: # %bb.0:
@@ -714,27 +636,16 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F: # %bb.0:
@@ -774,27 +685,16 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512F: # %bb.0:
@@ -834,27 +734,16 @@ define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512F: # %bb.0:
@@ -894,27 +783,16 @@ define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512F: # %bb.0:
@@ -953,3 +831,5 @@ define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind {
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX2: {{.*}}
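
[Note on why the mask shrinking in the hunks above is safe: the vpshufb control vectors in these tests only produce bytes that survive into the low dword/qword kept by the final vmovd/vmovq store; the broadcast copies in the old constants fed result lanes that were always discarded. A vmovd load zeroes the upper 12 bytes instead, and a zero pshufb control byte simply selects source byte 0 (its top bit is clear, so no lane zeroing), which again lands only in discarded lanes. Below is a minimal standalone sketch of the width-selection idea; it is not the actual X86FixupVectorConstants code, and shrunkLoadWidth is a made-up name for illustration.

#include <array>
#include <cstdint>
#include <cstdio>

// Smallest zero-extending scalar load (in bytes) that still covers
// every non-zero byte of a 16-byte constant; 16 means "cannot shrink".
static unsigned shrunkLoadWidth(const std::array<uint8_t, 16> &C) {
  unsigned Hi = 0; // one past the last non-zero byte
  for (unsigned I = 0; I != 16; ++I)
    if (C[I] != 0)
      Hi = I + 1;
  if (Hi <= 4)
    return 4; // movd / vmovd / movss
  if (Hi <= 8)
    return 8; // movq / vmovq / movsd
  return 16;  // keep the full vector load
}

int main() {
  std::array<uint8_t, 16> Mask = {2, 6, 10, 14}; // remaining bytes are 0
  std::printf("load width: %u bytes\n", shrunkLoadWidth(Mask)); // 4
}
]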
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 0ab141f4c2022b0..b4a5b29040310f3 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -382,27 +382,16 @@ define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # %bb.0:
@@ -447,27 +436,16 @@ define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind {
}
define void @trunc_v8i32_to_v8i8(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: trunc_v8i32_to_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F: # %bb.0:
@@ -519,7 +497,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -530,7 +508,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -692,7 +670,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -703,7 +681,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1021,7 +999,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1033,7 +1011,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1190,27 +1168,16 @@ define void @trunc_v4i64_to_v4i16(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F: # %bb.0:
@@ -1255,27 +1222,16 @@ define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind {
}
define void @trunc_v4i64_to_v4i8(ptr %L, ptr %S) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v4i64_to_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: retq
+; AVX-LABEL: trunc_v4i64_to_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 6e357a5fb34f502..85e160e497172a7 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -392,7 +392,7 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
+; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -411,7 +411,7 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
+; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
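
[The PR34175 change is the same idea applied to a vpermt2w index vector: [0,8,16,24] padded with zeros via vmovq instead of broadcast. The zero indices pull word 0 of the first table into the upper result lanes, but the following vpmovzxwd only consumes the low four words, so those lanes are dead. A tiny C++ model of that argument follows; lane counts and values are illustrative, not taken from the test.

#include <cstdint>
#include <cstdio>

int main() {
  uint16_t Tbl[32]; // words of the two 16-word tables, concatenated
  for (int I = 0; I != 32; ++I)
    Tbl[I] = 1000 + I;
  uint16_t Idx[16] = {0, 8, 16, 24}; // vmovq-loaded: upper 12 lanes are 0
  uint16_t Res[16];
  for (int I = 0; I != 16; ++I) // vpermt2w: each lane picks Tbl[Idx & 31]
    Res[I] = Tbl[Idx[I] & 31];
  for (int I = 0; I != 4; ++I) // vpmovzxwd reads only words 0..3
    std::printf("%u ", Res[I]); // prints: 1000 1008 1016 1024
  std::printf("\n");
}
]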
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 348501caf619a3c..777beea886f566a 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -834,43 +834,43 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_packsswb_128_fold() {
; X86-SSE-LABEL: test_x86_sse2_packsswb_128_fold:
; X86-SSE: ## %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; X86-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A]
+; X86-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-SSE-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: test_x86_sse2_packsswb_128_fold:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX1-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX1-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: test_x86_sse2_packsswb_128_fold:
; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: test_x86_sse2_packsswb_128_fold:
; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; X64-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-SSE-NEXT: ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A]
+; X64-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-SSE-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: test_x86_sse2_packsswb_128_fold:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX1-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX1-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: test_x86_sse2_packsswb_128_fold:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
@@ -902,43 +902,43 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
; X86-SSE-LABEL: test_x86_sse2_packuswb_128_fold:
; X86-SSE: ## %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; X86-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A]
+; X86-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-SSE-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: test_x86_sse2_packuswb_128_fold:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX1-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: test_x86_sse2_packuswb_128_fold:
; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: test_x86_sse2_packuswb_128_fold:
; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; X64-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A]
+; X64-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-SSE-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: test_x86_sse2_packuswb_128_fold:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX1-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: test_x86_sse2_packuswb_128_fold:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
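
[The encoding CHECK lines above explain the fixup-offset churn: movss/movsd carry a mandatory F3/F2 prefix ahead of the 0F 10 opcode, while movaps is plain 0F 28, so the legacy SSE instruction grows by one byte and the 4-byte constant-pool fixup moves from offset 3 to offset 4. The VEX forms stay the same length because the prefix is folded into the VEX byte (C5 F8 becoming C5 FB/FA) and only the opcode byte changes. A small C++ table of the three legacy encodings, including the ModRM byte 05 (RIP/disp32 addressing), with bytes as they appear in the CHECK lines:

#include <cstdio>

int main() {
  // opcode + ModRM bytes that precede the disp32 constant-pool reference
  struct Enc { const char *Name; const char *Bytes; unsigned FixupOff; };
  const Enc Encs[] = {
      {"movaps", "0F 28 05", 3},   // no mandatory prefix
      {"movsd", "F2 0F 10 05", 4}, // F2 prefix, one byte longer
      {"movss", "F3 0F 10 05", 4}, // F3 prefix, one byte longer
  };
  for (const Enc &E : Encs)
    std::printf("%-6s [%s] fixup at offset %u\n", E.Name, E.Bytes, E.FixupOff);
}
]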
diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll
index cdd30165a99bc31..883876a26c0f163 100644
--- a/llvm/test/CodeGen/X86/vec_anyext.ll
+++ b/llvm/test/CodeGen/X86/vec_anyext.ll
@@ -173,7 +173,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovdqa (%ecx), %xmm0
; X86-NEXT: vmovdqa 16(%ecx), %xmm1
-; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; X86-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X86-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X86-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -189,7 +189,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm0
; X64-NEXT: vmovdqa 16(%rdi), %xmm1
-; X64-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; X64-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -211,8 +211,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
define <4 x i16> @const_16_32() nounwind {
; CHECK-LABEL: const_16_32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7]
-; CHECK-NEXT: # xmm0 = mem[0,0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [0,3,8,7,0,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
%G = trunc <4 x i32> <i32 0, i32 3, i32 8, i32 7> to <4 x i16>
ret <4 x i16> %G
@@ -221,8 +220,7 @@ define <4 x i16> @const_16_32() nounwind {
define <4 x i16> @const_16_64() nounwind {
; CHECK-LABEL: const_16_64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7]
-; CHECK-NEXT: # xmm0 = mem[0,0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [0,3,8,7,0,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
%G = trunc <4 x i64> <i64 0, i64 3, i64 8, i64 7> to <4 x i16>
ret <4 x i16> %G
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index 1d81cb377308007..a0e9f33483b69cf 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -1907,13 +1907,12 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() {
define <4 x i32> @fptosi_2f64_to_2i32_const() {
; SSE-LABEL: fptosi_2f64_to_2i32_const:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,u,u]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4294967295,1,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i32_const:
; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,1,4294967295,1]
-; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,1,0,0]
; AVX-NEXT: retq
%cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -1966,13 +1965,12 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() {
define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i32_const:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,u,u]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [2,4,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_2f64_to_2i32_const:
; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [2,4,2,4]
-; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [2,4,0,0]
; AVX-NEXT: retq
%cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/vec_set-A.ll b/llvm/test/CodeGen/X86/vec_set-A.ll
index c8ff250b5bfbcd6..a288579bda34eaf 100644
--- a/llvm/test/CodeGen/X86/vec_set-A.ll
+++ b/llvm/test/CodeGen/X86/vec_set-A.ll
@@ -5,12 +5,12 @@
define <2 x i64> @test1() nounwind {
; X86-LABEL: test1:
; X86: # %bb.0:
-; X86-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0]
+; X86-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test1:
; X64: # %bb.0:
-; X64-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0]
+; X64-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0]
; X64-NEXT: retq
ret <2 x i64> < i64 1, i64 0 >
}
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index 73c2f4ca5b10bf1..bd5c9363794aa16 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -79,22 +79,16 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
; SSE41-LABEL: vsel_4xi8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,255,0,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: vsel_4xi8:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: vsel_4xi8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: vsel_4xi8:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,255,0,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
ret <4 x i8> %vsel
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index acf45fc4bbeba4f..0adb9ddfc426a8d 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -4379,7 +4379,7 @@ define <2 x i32> @constrained_vector_fptoui_v2i32_v2f32() #0 {
;
; AVX512-LABEL: constrained_vector_fptoui_v2i32_v2f32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0]
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0]
; AVX512-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 4f100cd3e05309b..550b2e06554385f 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -1212,7 +1212,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE-LABEL: splatvar_funnnel_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0]
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pandn %xmm3, %xmm4
; SSE-NEXT: psrlw $1, %xmm1
@@ -1224,7 +1224,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX-LABEL: splatvar_funnnel_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1235,7 +1235,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1246,7 +1246,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1257,7 +1257,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1278,7 +1278,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1295,7 +1295,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; XOP-LABEL: splatvar_funnnel_v8i16:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
@@ -1306,7 +1306,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
+; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
; X86-SSE2-NEXT: pandn %xmm3, %xmm4
; X86-SSE2-NEXT: psrlw $1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 05be4e1ee928e43..683fdf15cdea41e 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -1000,7 +1000,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
@@ -1088,7 +1088,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
@@ -1278,7 +1278,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) {
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [31,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: .p2align 4, 0x90
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index 9ddd171b4db6900..6a8d9d73f138b4c 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -945,7 +945,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; SSE41-LABEL: splatvar_funnnel_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0]
+; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pandn %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm4
@@ -958,7 +958,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX-LABEL: splatvar_funnnel_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -969,7 +969,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -980,7 +980,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -991,7 +991,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -1002,7 +1002,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 58719e6bd8e0c3c..6fc95cc7780ff4d 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -757,7 +757,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
index 825ca727b624eaf..3452b33ada2a9a0 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
@@ -324,7 +324,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512BW-LABEL: constant_funnnel_v2i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
index 0b6361ffd4fae37..64deaf0e75966e8 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
@@ -390,7 +390,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4,5,4,5]
+; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0]
; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 1f51f02a197ac40..70e3a0251674398 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1337,7 +1337,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE-LABEL: splatvar_funnnel_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0]
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: psrlw %xmm4, %xmm1
@@ -1349,7 +1349,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX-LABEL: splatvar_funnnel_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1360,7 +1360,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1371,7 +1371,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1382,7 +1382,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1403,7 +1403,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1421,7 +1421,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; XOP-LABEL: splatvar_funnnel_v8i16:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1432,7 +1432,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
+; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
; X86-SSE2-NEXT: pand %xmm3, %xmm4
; X86-SSE2-NEXT: psrlw %xmm4, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 1a6ecea596563ed..61aea6ad4d5955e 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1032,7 +1032,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
@@ -1121,7 +1121,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
+; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0]
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 402eb73e18101bd..3fa9994312e4549 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -982,7 +982,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; SSE41-LABEL: splatvar_funnnel_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0]
+; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm4
@@ -995,7 +995,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX-LABEL: splatvar_funnnel_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -1006,7 +1006,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -1017,7 +1017,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -1028,7 +1028,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -1039,7 +1039,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index bb311468ce913cb..b2047a04f163e6b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -796,7 +796,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
index b8c356711921d0d..d78aa4e049e0a32 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
@@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512BW-LABEL: constant_funnnel_v2i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -366,7 +366,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
+; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0]
; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
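
[The rotate-amount constants get the same treatment: the broadcast [4,5,4,5] becomes a vmovq load of [4,5,0,0]. Only the two live lanes of the widened zmm rotate are read back, and a variable rotate by 0 is the identity in any case, so zeroing the upper amounts changes nothing observable. A short C++ sketch of that reasoning, with illustrative values:

#include <cstdint>
#include <cstdio>

// Rotate right by N (mod 32); N == 0 is the identity.
static uint32_t Rotr(uint32_t V, unsigned N) {
  N &= 31;
  return N ? (V >> N) | (V << (32 - N)) : V;
}

int main() {
  uint32_t X[4] = {0x12345678, 0x9ABCDEF0, 0, 0}; // only lanes 0..1 are live
  uint32_t Amt[4] = {4, 5, 0, 0};                 // vmovq-loaded amounts
  for (int I = 0; I != 2; ++I) // consumers read only the low two lanes
    std::printf("lane %d: %08x\n", I, Rotr(X[I], Amt[I]));
}
]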
diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
index 56896927e7e5ad9..1add344e3e41fbe 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
@@ -454,7 +454,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4,5,4,5]
+; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0]
; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 21533818bac11a9..f59960f06f4a118 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3173,7 +3173,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; AVX512F-NEXT: callq __truncdfhf2 at PLT
; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [16,0,0,0]
+; AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [16,0,0,0]
; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -3195,7 +3195,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-FASTLANE-NEXT: callq __truncdfhf2 at PLT
; AVX512-FASTLANE-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX512-FASTLANE-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0]
+; AVX512-FASTLANE-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0]
; AVX512-FASTLANE-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512-FASTLANE-NEXT: addq $40, %rsp
; AVX512-FASTLANE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index d2bef6b234e3833..47aab9e264c19a5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -240,11 +240,11 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-SLOW-LABEL: load_i16_stride3_vf4:
; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
@@ -257,13 +257,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-FAST-LABEL: load_i16_stride3_vf4:
; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,5,8,11,2,5,8,11]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0]
; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm4
; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rdx)
@@ -967,8 +967,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm11
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm15
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm11[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1807,8 +1806,7 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm13[2],xmm8[3,4],xmm13[5],xmm8[6,7]
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm14
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm14[3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 904f8ddd2e7f443..0d5445dc194cec5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -409,22 +409,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-LABEL: load_i16_stride5_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,6,11,0,1,6,11,0]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,5,10,0,0,5,10,0]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax
; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,7,12,17,2,7,12,17]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4
; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6
; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
@@ -7449,7 +7449,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm15
; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm7
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3]
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [1,3,6,0,5,u,u,u]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm9
@@ -7482,7 +7482,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,4,6,3,6,u,u,u]
; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index fffe46d3cbed1d6..3efae5c1150456a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -213,7 +213,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512BW-FAST-NEXT: vpbroadcastw 4(%rdi), %xmm4
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,9,3,9,3,9,3,9]
+; AVX512BW-FAST-NEXT: vmovd {{.*#+}} xmm5 = [3,9,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm5
; AVX512BW-FAST-NEXT: vpbroadcastw 20(%rdi), %xmm6
; AVX512BW-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm7
@@ -501,19 +501,19 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-LABEL: load_i16_stride6_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,12,18,0,6,12,18]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,7,13,19,1,7,13,19]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,8,14,20,2,8,14,20]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,9,15,21,3,9,15,21]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,10,16,22,4,10,16,22]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,11,17,23,5,11,17,23]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
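
In the interleaved-load tests above, the vpermi2w/vpermw index vectors only ever consume their low four words, so the splatted upper halves of masks like [0,3,6,9,0,3,6,9] are dead and can be zeroed. Which scalar load the updated checks then use follows from the meaningful width and the register domain of the user; a hedged sketch of that mapping (zeroUpperLoad is an illustrative name):

  #include <string>

  // Pick the zero-extending scalar load seen in these checks from the
  // number of meaningful low bits and the domain of the instruction
  // consuming the constant.
  std::string zeroUpperLoad(unsigned UsedBits, bool FPDomain) {
    if (UsedBits <= 32)
      return FPDomain ? "vmovss" : "vmovd"; // e.g. [3,9,0,0,0,0,0,0]
    return FPDomain ? "vmovsd" : "vmovq";   // e.g. [0,3,6,9,0,0,0,0]
  }

This matches the vmovss lines for the vpermt2ps masks earlier in the diff and the vmovd/vmovq lines for the integer word permutes here.
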
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 4b32647f6fb3e79..853f94c84d1a8e5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -240,7 +240,7 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512BW-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [6,13,6,13,6,13,6,13]
+; AVX512BW-FAST-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm8
; AVX512BW-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX512BW-FAST-NEXT: vmovd %xmm4, (%rdx)
@@ -658,21 +658,21 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,7,14,21,0,7,14,21]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,8,15,22,1,8,15,22]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,9,16,23,2,9,16,23]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,10,17,24,3,10,17,24]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,11,18,25,4,11,18,25]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,12,19,26,5,12,19,26]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,13,20,27,6,13,20,27]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
@@ -1343,7 +1343,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX512F-FAST-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
@@ -2380,7 +2380,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1,2,3,4,5,6,7],ymm11[8],ymm12[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm12 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm15
; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7]
@@ -2546,7 +2546,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm14
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
@@ -5161,7 +5161,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
@@ -5496,7 +5496,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
@@ -5566,7 +5566,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm14
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
@@ -11374,7 +11374,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
@@ -11431,7 +11431,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
@@ -11493,7 +11493,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -11522,7 +11522,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13
; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
@@ -12194,7 +12194,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm6 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
@@ -12324,9 +12324,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm4 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index 78065bc73c1d3f3..054afe2dfdf2ede 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -271,23 +271,23 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,9,17,25,1,9,17,25]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,10,18,26,2,10,18,26]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,11,19,27,3,11,19,27]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,12,20,28,4,12,20,28]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,13,21,29,5,13,21,29]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,14,22,30,6,14,22,30]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [7,15,23,31,7,15,23,31]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
@@ -560,7 +560,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm15 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -615,7 +615,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm2
; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm9, %xmm2
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
@@ -626,7 +626,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm1, %xmm0
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [3,7,3,7]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm10 = [3,7,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm10, %xmm15
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14
@@ -1353,7 +1353,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm14
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -1490,7 +1490,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1
; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm15, %xmm1
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
@@ -1523,7 +1523,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm18 = [3,7,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm18, %xmm11
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -3238,7 +3238,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm11, %xmm0, %xmm7
; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm6
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0
@@ -3409,7 +3409,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm12, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
@@ -3563,7 +3563,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0
; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0
; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm11
@@ -3630,7 +3630,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm27 = [3,7,3,7]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm27 = [3,7,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm27, %xmm9
; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm0
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
@@ -3733,7 +3733,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31
; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm0
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = [1,5,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm4, %xmm0
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
@@ -7297,7 +7297,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1
; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm0, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16
@@ -7711,7 +7711,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1
; AVX512F-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -8050,7 +8050,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0
; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm9, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm30
@@ -8225,7 +8225,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1
; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm0, %xmm1
; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16
@@ -8477,7 +8477,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm15, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
@@ -8642,7 +8642,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0]
; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 4ed9a99c58c3f1b..cce2340d214aa64 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -78,7 +78,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0]
+; AVX512-FAST-NEXT: vmovd {{.*#+}} xmm3 = [5,0,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0
; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 40355e3f7686f66..30ac7dc443182e4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -78,7 +78,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
+; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FAST-NEXT: vmovq %xmm2, (%rsi)
@@ -160,8 +160,7 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5]
-; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
+; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm7 = [1,5,0,0]
; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
@@ -171,8 +170,7 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7]
-; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = [3,7,0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsi)
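
The AVX2-ONLY changes just above trade vmovddup, which splats the low qword, for vmovsd, which zeroes the upper qword; both read the same 8-byte constant-pool entry, and the '# xmm = mem[0,0]' decode comment disappears because nothing is duplicated any more. Only the upper lanes differ, and those are discarded by the vblendps that follows each vpermps. A purely behavioral sketch of the two semantics (register-level effect only, not the encodings):

  #include <array>
  #include <cstdint>

  // vmovddup xmm, m64: duplicate the loaded qword into both halves.
  std::array<uint32_t, 4> vmovddup(const uint32_t *Mem) {
    return {Mem[0], Mem[1], Mem[0], Mem[1]};
  }

  // vmovsd xmm, m64: zero-extend the loaded qword through the register.
  std::array<uint32_t, 4> vmovsd(const uint32_t *Mem) {
    return {Mem[0], Mem[1], 0, 0};
  }
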
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index d3f9b1c4e15a72a..317875ca0ba7a05 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -86,12 +86,10 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,2,4,2]
-; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
-; AVX2-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0
; AVX2-ONLY-NEXT: vmovlps %xmm4, (%rsi)
; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rdx)
@@ -118,13 +116,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-SLOW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2]
-; AVX512-SLOW-NEXT: # xmm2 = mem[0,0]
+; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512-SLOW-NEXT: vmovaps 32(%rdi), %ymm5
; AVX512-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
-; AVX512-SLOW-NEXT: # xmm6 = mem[0,0]
+; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX512-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512-SLOW-NEXT: vmovq %xmm1, (%rdx)
@@ -147,15 +143,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5]
+; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm5 = [3,5,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
-; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
-; AVX512-FAST-NEXT: # xmm1 = mem[0,0]
+; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
; AVX512-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1
-; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
-; AVX512-FAST-NEXT: # xmm6 = mem[0,0]
+; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3
; AVX512-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512-FAST-NEXT: vmovq %xmm4, (%rdx)
@@ -308,13 +302,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [4,2,4,2]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm2
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,3,5,3]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [5,3,0,0]
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index e4f94b368b7fd53..abd4525a677375e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -100,8 +100,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3]
-; AVX2-ONLY-NEXT: # xmm4 = mem[0,0]
+; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
@@ -134,7 +133,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11]
+; AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [4,11,0,0]
; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %ymm6
; AVX512-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
@@ -165,9 +164,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2]
+; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm5 = [7,2,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11]
+; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = [4,11,0,0]
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm6
; AVX512-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
@@ -355,8 +354,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
-; AVX2-SLOW-NEXT: # xmm10 = mem[0,0]
+; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -414,8 +412,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
-; AVX2-FAST-NEXT: # xmm10 = mem[0,0]
+; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -472,8 +469,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
-; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,0]
+; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -850,7 +846,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -954,7 +950,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -1058,7 +1054,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -1880,7 +1876,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10
@@ -2104,7 +2100,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [4,3,0,0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11
; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12
@@ -2332,7 +2328,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10
@@ -4253,7 +4249,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
@@ -4763,7 +4759,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [4,3,0,0]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
@@ -5274,7 +5270,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
@@ -9154,8 +9150,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3]
-; AVX2-SLOW-NEXT: # xmm7 = mem[0,0]
+; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm7 = [4,3,0,0]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
@@ -10195,8 +10190,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3]
-; AVX2-FAST-NEXT: # xmm4 = mem[0,0]
+; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
@@ -11245,8 +11239,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3]
-; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0]
+; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm7 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index faa1642831c1553..8f0c4b9b726a5e3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -153,7 +153,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
+; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
index 88ebda3622cc9b4..ec029c3f9c033bb 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
@@ -182,8 +182,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -199,7 +198,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX2-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX2-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -215,7 +214,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -281,8 +280,7 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
@@ -438,8 +436,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm8, %xmm12
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm1
; AVX1-ONLY-NEXT: vpackuswb %xmm12, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
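
The stride-2 byte shuffles show the same rewrite at byte granularity: the vpshufb control only needs [1,3,5,7,9,11,13,15] in its low eight bytes, because the vpunpcklqdq that follows keeps just the low qword of each shuffle result; the zeroed upper control bytes merely select byte 0 into lanes nobody reads. A behavioral sketch of what vmovq leaves in the register (models the architectural result only):

  #include <array>
  #include <cstdint>
  #include <cstring>

  // vmovq xmm, m64: load 8 bytes and zero the upper 8 bytes of the
  // 16-byte register (VEX-encoded forms also zero the wider ymm/zmm).
  std::array<uint8_t, 16> vmovq(const uint8_t *Mem) {
    std::array<uint8_t, 16> Xmm{}; // value-initialized: all zeros
    std::memcpy(Xmm.data(), Mem, 8);
    return Xmm;
  }
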
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index 40894a14a681c7a..427b39deeebb831 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -214,57 +214,31 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movq %xmm1, (%r8)
; SSE-NEXT: retq
;
-; AVX1-ONLY-LABEL: load_i8_stride4_vf8:
-; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-ONLY-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-ONLY-NEXT: vmovq %xmm3, (%rdx)
-; AVX1-ONLY-NEXT: vmovq %xmm4, (%rcx)
-; AVX1-ONLY-NEXT: vmovq %xmm1, (%r8)
-; AVX1-ONLY-NEXT: retq
-;
-; AVX2-ONLY-LABEL: load_i8_stride4_vf8:
-; AVX2-ONLY: # %bb.0:
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-ONLY-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-ONLY-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-ONLY-NEXT: vmovq %xmm1, (%r8)
-; AVX2-ONLY-NEXT: retq
+; AVX1-LABEL: load_i8_stride4_vf8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX1-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: vmovq %xmm3, (%rdx)
+; AVX1-NEXT: vmovq %xmm4, (%rcx)
+; AVX1-NEXT: vmovq %xmm1, (%r8)
+; AVX1-NEXT: retq
;
; AVX512-LABEL: load_i8_stride4_vf8:
; AVX512: # %bb.0:
@@ -405,7 +379,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX1-ONLY-LABEL: load_i8_stride4_vf16:
; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -413,34 +387,34 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -457,38 +431,38 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -774,13 +748,13 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX1-ONLY-LABEL: load_i8_stride4_vf32:
; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm4
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
@@ -800,11 +774,11 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm11
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm13
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
@@ -818,11 +792,11 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm12
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -836,11 +810,11 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
@@ -869,7 +843,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
@@ -885,7 +859,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm6, %ymm7
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm9
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm8
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
@@ -900,7 +874,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm6, %ymm9
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm9
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
@@ -915,7 +889,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm6, %ymm10
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
@@ -1536,7 +1510,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-LABEL: load_i8_stride4_vf64:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: subq $328, %rsp # imm = 0x148
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6
@@ -1545,7 +1519,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5
@@ -1605,13 +1579,13 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm4
; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -1657,12 +1631,12 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
@@ -1699,13 +1673,13 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
@@ -1768,7 +1742,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12
; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm1
; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm8
@@ -1811,7 +1785,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm13
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm15
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
@@ -1851,7 +1825,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm3
@@ -1883,7 +1857,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm6
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index 130207fd9c2ebcb..03139923ae5c466 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -150,47 +150,26 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movd %xmm3, (%r9)
; SSE-NEXT: retq
;
-; AVX1-ONLY-LABEL: load_i8_stride5_vf4:
-; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
-; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm3
-; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-ONLY-NEXT: vmovd %xmm3, (%rsi)
-; AVX1-ONLY-NEXT: vmovd %xmm4, (%rdx)
-; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx)
-; AVX1-ONLY-NEXT: vmovd %xmm6, (%r8)
-; AVX1-ONLY-NEXT: vmovd %xmm0, (%r9)
-; AVX1-ONLY-NEXT: retq
-;
-; AVX2-LABEL: load_i8_stride5_vf4:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
-; AVX2-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3
-; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
-; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
-; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
-; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm3, (%rsi)
-; AVX2-NEXT: vmovd %xmm4, (%rdx)
-; AVX2-NEXT: vmovd %xmm5, (%rcx)
-; AVX2-NEXT: vmovd %xmm6, (%r8)
-; AVX2-NEXT: vmovd %xmm0, (%r9)
-; AVX2-NEXT: retq
+; AVX-LABEL: load_i8_stride5_vf4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
+; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
+; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
+; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm3, (%rsi)
+; AVX-NEXT: vmovd %xmm4, (%rdx)
+; AVX-NEXT: vmovd %xmm5, (%rcx)
+; AVX-NEXT: vmovd %xmm6, (%r8)
+; AVX-NEXT: vmovd %xmm0, (%r9)
+; AVX-NEXT: retq
%wide.vec = load <20 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
%strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
@@ -707,7 +686,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm5
@@ -1551,7 +1530,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm11
@@ -3169,7 +3148,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1
@@ -3191,11 +3170,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0]
-; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128]
@@ -3351,8 +3328,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0]
-; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm0
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5]
@@ -3485,8 +3461,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0]
-; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7]
@@ -3494,8 +3469,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index 899f38951342e05..bc1d1fd47404235 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -380,8 +380,7 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,1,2,3,4,128,128,128]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5
@@ -825,7 +824,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm10, %xmm11, %xmm10
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
@@ -1728,7 +1727,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
@@ -1743,13 +1742,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0]
-; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
@@ -1763,9 +1760,9 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm10
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm10, %xmm4
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm4
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,6,12,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm10
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm8
@@ -1804,10 +1801,10 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm15
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,1,7,13,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
@@ -3597,14 +3594,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9
; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10
; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm1 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,6,12,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3625,16 +3620,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,1,7,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15
@@ -3652,12 +3645,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1
@@ -3668,7 +3659,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm1
@@ -3682,13 +3673,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14
@@ -3715,13 +3704,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0]
-; AVX1-ONLY-NEXT: # xmm15 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1
@@ -3737,9 +3724,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm4
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,6,12,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3825,12 +3812,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0]
-; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0]
-; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
@@ -3842,10 +3827,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm14
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,1,7,13,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm15
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
@@ -3919,12 +3904,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0]
-; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm13 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1
@@ -3941,9 +3924,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1
; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm1
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,6,12,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm5
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -3987,11 +3970,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0]
-; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm13 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0
@@ -4009,10 +3990,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0
; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,1,7,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
@@ -4054,11 +4035,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0]
-; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -4121,12 +4100,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm15 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0]
-; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm5
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 718d4973f97e251..a22dc7cb7be0cd3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -213,161 +213,44 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movd %xmm0, (%rax)
; SSE-NEXT: retq
;
-; AVX1-ONLY-LABEL: load_i8_stride7_vf4:
-; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
-; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
-; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
-; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovd %xmm2, (%rsi)
-; AVX1-ONLY-NEXT: vmovd %xmm3, (%rdx)
-; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx)
-; AVX1-ONLY-NEXT: vmovd %xmm7, (%r8)
-; AVX1-ONLY-NEXT: vmovd %xmm4, (%r9)
-; AVX1-ONLY-NEXT: vmovd %xmm6, (%r10)
-; AVX1-ONLY-NEXT: vmovd %xmm0, (%rax)
-; AVX1-ONLY-NEXT: retq
-;
-; AVX2-ONLY-LABEL: load_i8_stride7_vf4:
-; AVX2-ONLY: # %bb.0:
-; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
-; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5
-; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
-; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-ONLY-NEXT: vmovd %xmm2, (%rsi)
-; AVX2-ONLY-NEXT: vmovd %xmm3, (%rdx)
-; AVX2-ONLY-NEXT: vmovd %xmm5, (%rcx)
-; AVX2-ONLY-NEXT: vmovd %xmm7, (%r8)
-; AVX2-ONLY-NEXT: vmovd %xmm4, (%r9)
-; AVX2-ONLY-NEXT: vmovd %xmm6, (%r10)
-; AVX2-ONLY-NEXT: vmovd %xmm0, (%rax)
-; AVX2-ONLY-NEXT: retq
-;
-; AVX512F-LABEL: load_i8_stride7_vf4:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
-; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm5
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
-; AVX512F-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
-; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512F-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vmovd %xmm2, (%rsi)
-; AVX512F-NEXT: vmovd %xmm3, (%rdx)
-; AVX512F-NEXT: vmovd %xmm5, (%rcx)
-; AVX512F-NEXT: vmovd %xmm7, (%r8)
-; AVX512F-NEXT: vmovd %xmm4, (%r9)
-; AVX512F-NEXT: vmovd %xmm6, (%r10)
-; AVX512F-NEXT: vmovd %xmm0, (%rax)
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: load_i8_stride7_vf4:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovd %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovd %xmm3, (%rdx)
-; AVX512BW-NEXT: vmovd %xmm5, (%rcx)
-; AVX512BW-NEXT: vmovd %xmm7, (%r8)
-; AVX512BW-NEXT: vmovd %xmm4, (%r9)
-; AVX512BW-NEXT: vmovd %xmm6, (%r10)
-; AVX512BW-NEXT: vmovd %xmm0, (%rax)
-; AVX512BW-NEXT: retq
+; AVX-LABEL: load_i8_stride7_vf4:
+; AVX: # %bb.0:
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm5
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm7
+; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm2, (%rsi)
+; AVX-NEXT: vmovd %xmm3, (%rdx)
+; AVX-NEXT: vmovd %xmm5, (%rcx)
+; AVX-NEXT: vmovd %xmm7, (%r8)
+; AVX-NEXT: vmovd %xmm4, (%r9)
+; AVX-NEXT: vmovd %xmm6, (%r10)
+; AVX-NEXT: vmovd %xmm0, (%rax)
+; AVX-NEXT: retq
%wide.vec = load <28 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
%strided.vec1 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
@@ -628,7 +511,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,7,14,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0]
; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
@@ -654,7 +537,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
@@ -1403,7 +1286,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1
@@ -1415,7 +1298,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm8, %xmm9, %xmm8
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u]
@@ -1455,7 +1338,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12]
; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13
; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
@@ -2935,7 +2818,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u]
@@ -2952,7 +2835,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u]
@@ -2961,7 +2844,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
@@ -2988,7 +2871,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u]
@@ -3034,13 +2917,13 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm3, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,6,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm2
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm9
@@ -3096,7 +2979,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11]
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -3115,7 +2998,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm10
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
@@ -3127,7 +3010,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm7
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
@@ -3153,7 +3036,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10
; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm3
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
@@ -3179,7 +3062,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5
; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm5
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u]
@@ -3870,7 +3753,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpor %xmm4, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20
; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5
@@ -4308,7 +4191,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpor %xmm9, %xmm8, %xmm8
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13
; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm15
; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9
; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
@@ -6763,11 +6646,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10
; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8
; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm4
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,7,14,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
@@ -6781,7 +6662,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm8
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm8
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm7, %xmm8, %xmm7
; AVX1-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm7
@@ -6806,11 +6687,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,128,128,3,10,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,5,12,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm9
; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14
@@ -6841,10 +6722,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,128,128,4,11,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm13
; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,6,13,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm7
; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14
@@ -6863,11 +6744,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0]
-; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm2
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0]
-; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3]
@@ -6877,7 +6756,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0
@@ -6888,12 +6767,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0]
-; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3
@@ -6904,7 +6781,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm3, %xmm13, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -6920,11 +6797,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm14, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm12
; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm2
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3]
@@ -6946,11 +6821,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0]
-; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm3
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4]
@@ -6968,12 +6841,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0]
-; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm1
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
@@ -6984,7 +6855,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -6999,12 +6870,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,2,9,128,128,128,0]
-; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm15
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm5
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128]
@@ -7025,11 +6894,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm14
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0]
-; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm0
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4
@@ -7098,10 +6965,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm14
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -7141,21 +7008,21 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,0,0,2,9,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13
; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm1
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm15 = [0,0,0,0,4,11,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm7
; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,6,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm10
@@ -7167,7 +7034,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15]
; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm10
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,7,14,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm12
@@ -7219,7 +7086,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,7,14,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm10
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
@@ -7267,10 +7134,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,6,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -7286,7 +7153,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm7
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -7341,7 +7208,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm6
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm7
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
@@ -7371,9 +7238,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm3
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -7442,9 +7309,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm3
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -7512,7 +7379,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
@@ -7702,10 +7569,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,2,9,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5
; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm3
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm10
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -7743,9 +7610,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,3,10,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm9
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,5,12,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm11
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
@@ -8067,10 +7934,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm9
; AVX2-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm9
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,6,13,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm11
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
@@ -8773,10 +8640,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,2,9,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm10
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8814,9 +8681,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,3,10,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm9
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,5,12,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm11
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
@@ -9138,10 +9005,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm9
; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm9, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm9
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,6,13,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm11
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
@@ -9247,7 +9114,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX512F-ONLY-SLOW-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm3
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm27
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20
@@ -9561,7 +9428,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm2
; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm17, %ymm7, %ymm28
-; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512F-ONLY-SLOW-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm21, %ymm15
; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm29, %ymm7
; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -9977,7 +9844,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm3
; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16
-; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512F-ONLY-FAST-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm13
; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm14
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -10093,7 +9960,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX512DQ-SLOW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm3
; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30
; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20
@@ -10412,7 +10279,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm2
; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm21, %ymm7, %ymm16
-; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512DQ-SLOW-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm15
; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm7
; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -10829,7 +10696,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm8
; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16
-; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512DQ-FAST-NEXT: vmovd {{.*#+}} xmm6 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm0, %ymm13
; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm0, %ymm31, %ymm14
; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -10944,7 +10811,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-ONLY-SLOW-NEXT: vmovq {{.*#+}} xmm21 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm3
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8
; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
@@ -11224,7 +11091,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u]
; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11
; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-ONLY-SLOW-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm12
; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
@@ -11578,7 +11445,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm0, %xmm2
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-ONLY-FAST-NEXT: vmovd {{.*#+}} xmm0 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm11
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3]
@@ -11680,7 +11547,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512DQBW-SLOW-NEXT: vmovq {{.*#+}} xmm21 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0]
; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm3
; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8
; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
@@ -11960,7 +11827,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2
; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512DQBW-SLOW-NEXT: vmovd {{.*#+}} xmm11 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm12
; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
@@ -12311,7 +12178,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u]
; AVX512DQBW-FAST-NEXT: vporq %xmm2, %xmm21, %xmm2
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512DQBW-FAST-NEXT: vmovd {{.*#+}} xmm21 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512DQBW-FAST-NEXT: vpshufb %xmm21, %xmm8, %xmm11
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index f2133b9e42d30df..24f477064ed75f6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -235,103 +235,54 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movd %xmm1, (%rax)
; SSE-NEXT: retq
;
-; AVX1-ONLY-LABEL: load_i8_stride8_vf4:
-; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm7
-; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-ONLY-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-ONLY-NEXT: vmovd %xmm3, (%rdx)
-; AVX1-ONLY-NEXT: vmovd %xmm4, (%rcx)
-; AVX1-ONLY-NEXT: vmovd %xmm5, (%r8)
-; AVX1-ONLY-NEXT: vmovd %xmm6, (%r9)
-; AVX1-ONLY-NEXT: vmovd %xmm7, (%r11)
-; AVX1-ONLY-NEXT: vmovd %xmm8, (%r10)
-; AVX1-ONLY-NEXT: vmovd %xmm1, (%rax)
-; AVX1-ONLY-NEXT: retq
-;
-; AVX2-ONLY-LABEL: load_i8_stride8_vf4:
-; AVX2-ONLY: # %bb.0:
-; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm7
-; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm7 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-ONLY-NEXT: vmovd %xmm3, (%rdx)
-; AVX2-ONLY-NEXT: vmovd %xmm4, (%rcx)
-; AVX2-ONLY-NEXT: vmovd %xmm5, (%r8)
-; AVX2-ONLY-NEXT: vmovd %xmm6, (%r9)
-; AVX2-ONLY-NEXT: vmovd %xmm7, (%r11)
-; AVX2-ONLY-NEXT: vmovd %xmm8, (%r10)
-; AVX2-ONLY-NEXT: vmovd %xmm1, (%rax)
-; AVX2-ONLY-NEXT: retq
+; AVX1-LABEL: load_i8_stride8_vf4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX1-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX1-NEXT: vmovd {{.*#+}} xmm6 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm7
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX1-NEXT: vmovd {{.*#+}} xmm7 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm8
+; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX1-NEXT: vmovd {{.*#+}} xmm8 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm9
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm8
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX1-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: vmovd %xmm3, (%rdx)
+; AVX1-NEXT: vmovd %xmm4, (%rcx)
+; AVX1-NEXT: vmovd %xmm5, (%r8)
+; AVX1-NEXT: vmovd %xmm6, (%r9)
+; AVX1-NEXT: vmovd %xmm7, (%r11)
+; AVX1-NEXT: vmovd %xmm8, (%r10)
+; AVX1-NEXT: vmovd %xmm1, (%rax)
+; AVX1-NEXT: retq
;
; AVX512-LABEL: load_i8_stride8_vf4:
; AVX512: # %bb.0:
@@ -592,7 +543,7 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
@@ -600,70 +551,70 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -687,74 +638,74 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm9
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm11
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1274,20 +1225,20 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX1-ONLY-LABEL: load_i8_stride8_vf16:
; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm3
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm6
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6
@@ -1295,143 +1246,143 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,5],xmm9[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm9[4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5],xmm10[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3],xmm13[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm12
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5],xmm12[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5],xmm13[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm15
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm15
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -1455,13 +1406,13 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-LABEL: load_i8_stride8_vf16:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm8
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm6
; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2
@@ -1471,147 +1422,147 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm9
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm12
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm11
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3]
; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm12
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm11
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm12
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm11
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm13
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12
; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm9
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm12
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm11
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm13
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm13
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm12
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm14
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm13
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm14
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm13
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm14
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm13
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm0
; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm15
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm14
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm0
; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm15
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm4
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
@@ -1632,237 +1583,121 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rax)
; AVX2-ONLY-NEXT: retq
;
-; AVX512F-LABEL: load_i8_stride8_vf16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm0
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512F-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512F-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512F-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512F-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512F-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512F-NEXT: vpshufb %xmm7, %xmm3, %xmm7
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512F-NEXT: vpsrlq $8, %zmm5, %zmm7
-; AVX512F-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512F-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512F-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512F-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm8
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512F-NEXT: vpsrlq $16, %zmm5, %zmm8
-; AVX512F-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512F-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm9
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512F-NEXT: vpsrlq $24, %zmm5, %zmm9
-; AVX512F-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512F-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512F-NEXT: vpshufb %xmm9, %xmm1, %xmm9
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512F-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512F-NEXT: vpshufb %xmm10, %xmm3, %xmm10
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512F-NEXT: vpsrlq $32, %zmm5, %zmm10
-; AVX512F-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512F-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512F-NEXT: vpshufb %xmm10, %xmm1, %xmm10
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512F-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512F-NEXT: vpshufb %xmm11, %xmm3, %xmm11
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512F-NEXT: vpsrlq $40, %zmm5, %zmm11
-; AVX512F-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512F-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512F-NEXT: vpshufb %xmm11, %xmm1, %xmm11
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512F-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512F-NEXT: vpshufb %xmm12, %xmm3, %xmm12
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512F-NEXT: vpsrlq $48, %zmm5, %zmm12
-; AVX512F-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512F-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512F-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512F-NEXT: vpsrlq $56, %zmm5, %zmm1
-; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512F-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512F-NEXT: vmovdqa %xmm7, (%rcx)
-; AVX512F-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512F-NEXT: vmovdqa %xmm9, (%r9)
-; AVX512F-NEXT: vmovdqa %xmm10, (%r11)
-; AVX512F-NEXT: vmovdqa %xmm11, (%r10)
-; AVX512F-NEXT: vmovdqa %xmm0, (%rax)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: load_i8_stride8_vf16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512BW-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm3, %xmm7
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512BW-NEXT: vpsrlq $8, %zmm5, %zmm7
-; AVX512BW-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm3, %xmm8
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512BW-NEXT: vpsrlq $16, %zmm5, %zmm8
-; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm9, %xmm3, %xmm9
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512BW-NEXT: vpsrlq $24, %zmm5, %zmm9
-; AVX512BW-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm9
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm3, %xmm10
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512BW-NEXT: vpsrlq $32, %zmm5, %zmm10
-; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm1, %xmm10
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512BW-NEXT: vpshufb %xmm11, %xmm3, %xmm11
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512BW-NEXT: vpsrlq $40, %zmm5, %zmm11
-; AVX512BW-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512BW-NEXT: vpshufb %xmm11, %xmm1, %xmm11
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm3, %xmm12
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512BW-NEXT: vpsrlq $48, %zmm5, %zmm12
-; AVX512BW-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpsrlq $56, %zmm5, %zmm1
-; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx)
-; AVX512BW-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512BW-NEXT: vmovdqa %xmm9, (%r9)
-; AVX512BW-NEXT: vmovdqa %xmm10, (%r11)
-; AVX512BW-NEXT: vmovdqa %xmm11, (%r10)
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rax)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: load_i8_stride8_vf16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2
+; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512-NEXT: vpmovqb %zmm5, %xmm6
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm7
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm8
+; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm7
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
+; AVX512-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512-NEXT: vpmovqb %zmm7, %xmm7
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm8
+; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm8, %xmm2, %xmm9
+; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm8
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
+; AVX512-NEXT: vpsrlq $16, %zmm5, %zmm8
+; AVX512-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm8
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm10
+; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm9
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
+; AVX512-NEXT: vpsrlq $24, %zmm5, %zmm9
+; AVX512-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm10
+; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm9
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm11
+; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm10
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512-NEXT: vpsrlq $32, %zmm5, %zmm10
+; AVX512-NEXT: vpmovqb %zmm10, %xmm10
+; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm11
+; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm11, %xmm2, %xmm12
+; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm11
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
+; AVX512-NEXT: vpsrlq $40, %zmm5, %zmm11
+; AVX512-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm12
+; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm11
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm13
+; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm12
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
+; AVX512-NEXT: vpsrlq $48, %zmm5, %zmm12
+; AVX512-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512-NEXT: vpsrlq $56, %zmm5, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm7, (%rcx)
+; AVX512-NEXT: vmovdqa %xmm8, (%r8)
+; AVX512-NEXT: vmovdqa %xmm9, (%r9)
+; AVX512-NEXT: vmovdqa %xmm10, (%r11)
+; AVX512-NEXT: vmovdqa %xmm11, (%r10)
+; AVX512-NEXT: vmovdqa %xmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%wide.vec = load <128 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
%strided.vec1 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <16 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121>
@@ -2832,7 +2667,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-LABEL: load_i8_stride8_vf32:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2841,7 +2676,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3
@@ -2850,7 +2685,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
@@ -2862,7 +2697,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
@@ -2904,23 +2739,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm0
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -2953,23 +2788,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm7
@@ -3001,23 +2836,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8
@@ -3049,25 +2884,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -3098,23 +2933,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -3148,24 +2983,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -3195,25 +3030,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -3267,7 +3102,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: subq $360, %rsp # imm = 0x168
; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4
; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm2
@@ -3275,7 +3110,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm2
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm6
@@ -3289,11 +3124,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm9
; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm11
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm11
; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm13
@@ -3337,24 +3172,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7
@@ -3388,24 +3223,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm15
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7
@@ -3439,24 +3274,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm13
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -3490,24 +3325,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm7
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -3540,24 +3375,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm11
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -3591,24 +3426,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -3641,24 +3476,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -3718,13 +3553,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7
; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm8
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm1
; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm13
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm3
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm10
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3
@@ -3737,12 +3572,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5
; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm15
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm1 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm9
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm6
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm9 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm11
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm5
@@ -3768,11 +3603,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm4
; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3
@@ -3780,12 +3615,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm12
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3
@@ -3803,21 +3638,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm3 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm12
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm1
@@ -3835,26 +3670,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm5
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm8
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm10
; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm12
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3
@@ -3872,24 +3707,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0
; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm9
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm8
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm1 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1
; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm14
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm6
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2
@@ -3911,22 +3746,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm0
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -3941,21 +3776,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm4 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm0
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
@@ -3969,20 +3804,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm4 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
@@ -4022,7 +3857,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: subq $360, %rsp # imm = 0x168
; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm2
@@ -4030,7 +3865,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm6
@@ -4044,11 +3879,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm14 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm9
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm11
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm3 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm11
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm13
@@ -4092,24 +3927,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm7
@@ -4143,24 +3978,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm15
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm7
@@ -4194,24 +4029,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm13
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -4245,24 +4080,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm3 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -4295,24 +4130,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm11
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -4346,24 +4181,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -4396,24 +4231,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -4471,7 +4306,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm12
; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4
@@ -4480,7 +4315,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm9
; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5
@@ -4515,13 +4350,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17
; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm6
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm19
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm12
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm12
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm22
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm13
@@ -4529,14 +4364,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm13
; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12
; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm10
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8
; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm9
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm13
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1
@@ -4562,13 +4397,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm9
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm8
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7
@@ -4577,12 +4412,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm14
; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm12
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm14
@@ -4606,12 +4441,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22
@@ -4619,12 +4454,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm14
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm27
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
@@ -4648,13 +4483,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm7
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14
@@ -4662,13 +4497,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm14
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm22
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm25
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
@@ -4691,12 +4526,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm1
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm13
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4
@@ -4705,13 +4540,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4
@@ -4735,12 +4570,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23
; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm5
@@ -4749,11 +4584,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5
@@ -4776,12 +4611,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
@@ -4789,11 +4624,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
@@ -4846,14 +4681,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm5
; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm2
; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm12
; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10
; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm5
@@ -4875,11 +4710,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10
; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
@@ -4904,12 +4739,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7
; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10
; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
@@ -4933,11 +4768,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm6
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm6
; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm9
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
@@ -4959,12 +4794,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm6
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm15
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm14
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm0
; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm15
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
@@ -4982,12 +4817,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm14
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm14
; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm7
; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
@@ -5007,11 +4842,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm14
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm14
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
@@ -5030,11 +4865,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm8
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
@@ -5066,14 +4901,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm3
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm7
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm3
; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm9
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm8
@@ -5105,23 +4940,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm11
; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm12
; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm13
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm15 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm17 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -5140,23 +4975,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm15 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
@@ -5175,23 +5010,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm15 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
@@ -5210,23 +5045,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm17 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -5245,23 +5080,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm15
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm15 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm17 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -5280,23 +5115,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm10
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm15 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm17 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -5314,23 +5149,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm7 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm9
; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm7
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm10
; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm19, %xmm9
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
@@ -5385,13 +5220,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm28, %ymm1
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm5
; AVX512BW-FAST-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX512BW-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm3
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm16 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm16 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512BW-FAST-NEXT: vpshufb %xmm16, %xmm3, %xmm17
; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %xmm5
@@ -5413,11 +5248,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm19, %ymm28, %ymm8
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm7
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm20
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3]
@@ -5438,11 +5273,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm28, %ymm10
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm10
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm24
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3]
@@ -5462,11 +5297,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm28, %ymm10
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm10
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm28
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3]
@@ -5487,11 +5322,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm8
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm12
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
@@ -5507,11 +5342,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm19, %ymm15, %ymm14
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm16
; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3]
@@ -5527,11 +5362,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm15, %ymm6
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm16
; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3]
@@ -5547,11 +5382,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm15, %ymm11
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
@@ -7477,7 +7312,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-LABEL: load_i8_stride8_vf64:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
@@ -7485,7 +7320,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15
@@ -7495,7 +7330,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -7503,7 +7338,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -7614,23 +7449,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
@@ -7712,24 +7547,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
@@ -7809,26 +7644,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14
@@ -7907,25 +7742,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -8006,25 +7841,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14
@@ -8103,26 +7938,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -8201,24 +8036,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm1 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm14
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
@@ -8338,14 +8173,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: subq $840, %rsp # imm = 0x348
; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm13
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15
; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8356,14 +8191,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm1
; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11
; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm5
; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm10
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm6
; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm8
@@ -8478,23 +8313,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
@@ -8579,25 +8414,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm0 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -8680,25 +8515,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm0 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -8783,24 +8618,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14
@@ -8885,26 +8720,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm0 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -8986,26 +8821,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm1
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm0 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
@@ -9091,24 +8926,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm1
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm15
@@ -9232,14 +9067,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: subq $904, %rsp # imm = 0x388
; AVX2-FAST-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm15
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm4
@@ -9248,7 +9083,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm14
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm4
; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5
@@ -9257,7 +9092,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm5
; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm8 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5
; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm6
; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9349,24 +9184,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm10
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm11 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm14
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm14 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
@@ -9422,22 +9257,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm10
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm9 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm11
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm11 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm14
@@ -9490,24 +9325,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm10
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm9 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm11
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm11 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm14
@@ -9529,7 +9364,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm0
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6
@@ -9562,24 +9397,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm7 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm8 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
@@ -9608,7 +9443,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm9
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5
@@ -9647,25 +9482,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm9
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm10
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm12
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm11 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm13
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm13 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -9719,23 +9554,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm12
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm11 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm12
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm13
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm13 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm14
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
@@ -9790,25 +9625,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm10
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm13
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm12 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm13
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm14
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm14 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm8
@@ -9907,14 +9742,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: subq $840, %rsp # imm = 0x348
; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm13
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm15
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9925,14 +9760,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm8
@@ -10047,23 +9882,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm15
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
@@ -10148,25 +9983,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm0 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -10249,25 +10084,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm0 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -10352,24 +10187,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14
@@ -10454,26 +10289,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm0 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -10555,26 +10390,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm0 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
@@ -10660,24 +10495,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm15
@@ -10803,7 +10638,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm3
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7
; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4
@@ -10812,7 +10647,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm11
; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm15
@@ -10894,13 +10729,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24
; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm21
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27
; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm4
@@ -10910,14 +10745,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm11
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm31
@@ -10991,7 +10826,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm1
@@ -10999,7 +10834,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm8
@@ -11008,13 +10843,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm24
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm9
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm30
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5
@@ -11086,11 +10921,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm10
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3
@@ -11100,13 +10935,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm28
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11
@@ -11174,13 +11009,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm10
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -11189,13 +11024,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm8
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm9
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm12
@@ -11264,13 +11099,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4
@@ -11278,13 +11113,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm27
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15
@@ -11356,14 +11191,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
@@ -11372,13 +11207,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5
@@ -11447,13 +11282,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
@@ -11462,13 +11297,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11
@@ -11581,7 +11416,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5
; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30
@@ -11589,7 +11424,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6
; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm25
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm7
; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm24
@@ -11664,13 +11499,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm16
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm13
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm3
; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm14
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm5
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm9
; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm0
; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm12
@@ -11730,11 +11565,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm17
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm15
@@ -11793,11 +11628,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm15
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm5
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm12
@@ -11859,13 +11694,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm10
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm9
; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm20
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm10
; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm22
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm10
; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm23
; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -11928,14 +11763,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm12
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm12
; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm19
; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm4
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm9
; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14
; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -11992,13 +11827,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm12
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13
; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm23
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14
; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm28
; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15
@@ -12051,13 +11886,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm4
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5
; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm6
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm11
; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2
@@ -12122,7 +11957,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2
; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2
; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm4
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm3
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24
; AVX512BW-SLOW-NEXT: vmovdqa 480(%rdi), %xmm6
@@ -12131,7 +11966,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm6
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm19 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26
; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm7
@@ -12218,14 +12053,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm3
; AVX512BW-SLOW-NEXT: vmovdqa64 416(%rdi), %xmm20
; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm29
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm19
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm13
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12
; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm26, %xmm24
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm30, %xmm16
@@ -12234,13 +12069,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm26 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm24
; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm20, %xmm25
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm23
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm30 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm3, %xmm24
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21
; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm7, %xmm25
@@ -12305,12 +12140,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm19, %xmm2
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm20
; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm25
; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -12319,14 +12154,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm24
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm29, %xmm13
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm30
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm16
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm30 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, %xmm18
; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm21, %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm17
@@ -12395,12 +12230,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm25, %xmm3
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
@@ -12408,12 +12243,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, (%rsp) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm19
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm19 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm30
; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm17, %xmm13
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3]
@@ -12475,13 +12310,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm28
; AVX512BW-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
@@ -12490,12 +12325,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm13
; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm19
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm19 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm30
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm24
; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
@@ -12559,24 +12394,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm2
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm4
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm13
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm13 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm24, %xmm19
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm26
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -12636,25 +12471,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm5
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm5
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm13
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm13 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm26, %xmm19
; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm18, %xmm30
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
@@ -12711,23 +12546,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm28, %xmm2
; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm5
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm8
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-SLOW-NEXT: vmovd {{.*#+}} xmm8 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm9
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
@@ -12832,13 +12667,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm10
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vmovdqa64 368(%rdi), %xmm21
; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm21, %xmm2
; AVX512BW-FAST-NEXT: vmovdqa 352(%rdi), %xmm4
; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm3
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm23 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vmovdqa 336(%rdi), %xmm12
; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm5
; AVX512BW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm28
@@ -12899,12 +12734,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm15
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vmovdqa64 %xmm21, %xmm5
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm21, %xmm15
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm23
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm15[0],xmm23[1],xmm15[1],xmm23[2],xmm15[2],xmm23[3],xmm15[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm23 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm23 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vmovdqa %xmm12, %xmm7
; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm25
; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm28, %xmm20
@@ -12949,12 +12784,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm12
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13
; AVX512BW-FAST-NEXT: vmovdqa %xmm4, %xmm1
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15
; AVX512BW-FAST-NEXT: vmovdqa64 %xmm7, %xmm23
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm28, %xmm20
@@ -12997,13 +12832,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm11
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vmovdqa64 %xmm5, %xmm19
; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm11
; AVX512BW-FAST-NEXT: vmovdqa64 %xmm1, %xmm20
; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm12
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm14
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm28, %xmm15
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
@@ -13050,13 +12885,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm12
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm12
; AVX512BW-FAST-NEXT: vmovdqa64 %xmm19, %xmm16
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm15
; AVX512BW-FAST-NEXT: vmovdqa64 %xmm20, %xmm17
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm20 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm23, %xmm15
; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm28, %xmm29
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm29[0],xmm15[0],xmm29[1],xmm15[1],xmm29[2],xmm15[2],xmm29[3],xmm15[3]
@@ -13100,11 +12935,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm12
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm16, %xmm12
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm17, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm23, %xmm20
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm28, %xmm27
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm20[0],xmm27[1],xmm20[1],xmm27[2],xmm20[2],xmm27[3],xmm20[3]
@@ -13145,11 +12980,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm14, %ymm9
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm16, %xmm9
; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm17, %xmm11
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm12
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm28, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
@@ -13187,11 +13022,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm2
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm16, %xmm2
; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm17, %xmm4
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm23, %xmm9
; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm28, %xmm11
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
@@ -13262,6 +13097,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2: {{.*}}
; AVX512-FAST: {{.*}}
; AVX512-SLOW: {{.*}}
+; AVX512BW: {{.*}}
; AVX512BW-ONLY: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
@@ -13271,6 +13107,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-ONLY: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
+; AVX512F: {{.*}}
; AVX512F-ONLY: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
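
A note on the shrunk masks in the hunks above: every rewritten vpshufb
constant keeps its nonzero indices in the low 8 bytes, so the old 16-byte
broadcast can be replaced by an 8-byte constant loaded with a zero-extending
vmovq - or vmovd when only the low 4 bytes are nonzero. Below is a minimal
standalone sketch of the qualifying shape; the function name and mask values
are illustrative, not taken from the tests:

define <16 x i8> @deinterleave_step(<16 x i8> %v) {
  ; mask bytes 8..15 are zero, so the mask's constant-pool load qualifies
  ; for the vmovq 'zero upper' form instead of a 16-byte broadcast load
  %r = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %v,
         <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 4, i8 12,
                    i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  ret <16 x i8> %r
}
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
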
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index c4bf0c3630e835b..9ceb3a250bc418f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -342,7 +342,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-SLOW-NEXT: shrq $48, %rax
; AVX2-SLOW-NEXT: vmovd %eax, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9)
@@ -374,7 +374,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FAST-NEXT: shrq $48, %rax
; AVX2-FAST-NEXT: vmovd %eax, %xmm1
; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9)
@@ -406,7 +406,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FAST-PERLANE-NEXT: shrq $48, %rax
; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
+; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 482da013d741be4..3aab872fa4a91df 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -475,8 +475,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,255,255,255,255,0,0,0]
-; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index e60034f7d6b75f0..dcafb5267906387 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -187,13 +187,11 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,2,6,10,14,3,7,11,15]
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,4,8,12,1,5,9,13]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -361,47 +359,39 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,2,10,0,0,3,11]
-; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,2,10,0,0,3,11,0,0]
-; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,8,0,0,1,9]
-; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,8,0,0,1,9,0,0]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,6,14,0,0,7,15]
-; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,6,14,0,0,7,15,0,0]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,4,12,0,0,5,13]
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,4,12,0,0,5,13,0,0]
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
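
The vmovddup -> vmovq changes in the store tests above are the same shrink
seen from the other side: each byte shuffle feeds a vpunpcklwd, which reads
only the low 8 bytes of its sources, so the repeated upper half of the
splatted mask appears to be dead and can be materialized as zeros instead,
letting the scalar vmovq load win per the rationale in the commit message.
A rough .ll sketch of that shape - names and masks are illustrative, and
whether the rewrite fires depends on the surrounding lowering:

define <16 x i8> @interleave_low(<16 x i8> %a, <16 x i8> %b) {
  ; both pshufbs use the same splatted 8-byte pattern; only the low half
  ; of each result survives the word interleave below, so mask bytes
  ; 8..15 are free to become zeros rather than a splat repeat
  %sa = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a,
          <16 x i8> <i8 0, i8 8, i8 0, i8 0, i8 1, i8 9, i8 0, i8 0,
                     i8 0, i8 8, i8 0, i8 0, i8 1, i8 9, i8 0, i8 0>)
  %sb = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %b,
          <16 x i8> <i8 0, i8 8, i8 0, i8 0, i8 1, i8 9, i8 0, i8 0,
                     i8 0, i8 8, i8 0, i8 0, i8 1, i8 9, i8 0, i8 0>)
  ; punpcklwd-style interleave of the low 8 bytes of %sa and %sb
  %r = shufflevector <16 x i8> %sa, <16 x i8> %sb,
          <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3,
                      i32 18, i32 19, i32 4, i32 5, i32 20, i32 21,
                      i32 6, i32 7, i32 22, i32 23>
  ret <16 x i8> %r
}
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
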
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
index 122b478577fbfee..5d02bb8b05f1813 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -96,7 +96,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; SSSE3-LABEL: testv2i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
@@ -128,7 +128,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; SSE41-LABEL: testv2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
@@ -160,7 +160,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; AVX1OR2-LABEL: testv2i64:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -188,7 +188,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; AVX512VL-LABEL: testv2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -216,7 +216,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv2i64:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -257,7 +257,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; X86-SSE-LABEL: testv2i64:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pshufb %xmm0, %xmm4
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -374,7 +374,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; SSSE3-LABEL: testv2i64u:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
@@ -406,7 +406,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; SSE41-LABEL: testv2i64u:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
@@ -438,7 +438,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; AVX1OR2-LABEL: testv2i64u:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -466,7 +466,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; AVX512VL-LABEL: testv2i64u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -494,7 +494,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv2i64u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -535,7 +535,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; X86-SSE-LABEL: testv2i64u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pshufb %xmm0, %xmm4
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -656,7 +656,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; SSSE3-LABEL: testv4i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
@@ -682,7 +682,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; SSE41-LABEL: testv4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -708,7 +708,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; AVX1OR2-LABEL: testv4i32:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -731,7 +731,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; AVX512VL-LABEL: testv4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -754,7 +754,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv4i32:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -790,7 +790,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; X86-SSE-LABEL: testv4i32:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pshufb %xmm0, %xmm4
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -905,7 +905,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; SSSE3-LABEL: testv4i32u:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
@@ -931,7 +931,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; SSE41-LABEL: testv4i32u:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -957,7 +957,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; AVX1OR2-LABEL: testv4i32u:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -980,7 +980,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; AVX512VL-LABEL: testv4i32u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1003,7 +1003,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv4i32u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1039,7 +1039,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; X86-SSE-LABEL: testv4i32u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pshufb %xmm0, %xmm4
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1142,7 +1142,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
@@ -1162,7 +1162,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; SSE41-LABEL: testv8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -1182,7 +1182,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; AVX1OR2-LABEL: testv8i16:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -1200,7 +1200,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; AVX512VL-LABEL: testv8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1218,7 +1218,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv8i16:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1254,7 +1254,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; X86-SSE-LABEL: testv8i16:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm2, %xmm3
; X86-SSE-NEXT: pshufb %xmm0, %xmm3
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1350,7 +1350,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; SSSE3-LABEL: testv8i16u:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
@@ -1370,7 +1370,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; SSE41-LABEL: testv8i16u:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -1390,7 +1390,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; AVX1OR2-LABEL: testv8i16u:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
@@ -1408,7 +1408,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; AVX512VL-LABEL: testv8i16u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1426,7 +1426,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv8i16u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
@@ -1462,7 +1462,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; X86-SSE-LABEL: testv8i16u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm2, %xmm3
; X86-SSE-NEXT: pshufb %xmm0, %xmm3
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1552,7 +1552,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; SSSE3-LABEL: testv16i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: psrlw $4, %xmm0
@@ -1567,7 +1567,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; SSE41-LABEL: testv16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: psrlw $4, %xmm0
@@ -1582,7 +1582,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; AVX1OR2-LABEL: testv16i8:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1595,7 +1595,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; AVX512VL-LABEL: testv16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
@@ -1608,7 +1608,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv16i8:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
@@ -1630,7 +1630,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; X86-SSE-LABEL: testv16i8:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pshufb %xmm0, %xmm2
; X86-SSE-NEXT: psrlw $4, %xmm0
@@ -1715,7 +1715,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; SSSE3-LABEL: testv16i8u:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm0, %xmm2
; SSSE3-NEXT: psrlw $4, %xmm0
@@ -1730,7 +1730,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; SSE41-LABEL: testv16i8u:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: psrlw $4, %xmm0
@@ -1745,7 +1745,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; AVX1OR2-LABEL: testv16i8u:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1758,7 +1758,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; AVX512VL-LABEL: testv16i8u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
@@ -1771,7 +1771,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv16i8u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
@@ -1793,7 +1793,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; X86-SSE-LABEL: testv16i8u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pshufb %xmm0, %xmm2
; X86-SSE-NEXT: psrlw $4, %xmm0
@@ -1812,22 +1812,22 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0]
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv2i64:
; NOBW: # %bb.0:
-; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0]
+; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0]
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv2i64:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0]
; AVX512VLBWDQ-NEXT: retq
;
; X86-SSE-LABEL: foldv2i64:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0]
; X86-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
ret <2 x i64> %out
@@ -1836,22 +1836,22 @@ define <2 x i64> @foldv2i64() nounwind {
define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0]
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv2i64u:
; NOBW: # %bb.0:
-; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0]
+; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0]
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv2i64u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0]
; AVX512VLBWDQ-NEXT: retq
;
; X86-SSE-LABEL: foldv2i64u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0]
; X86-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
ret <2 x i64> %out
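The movaps -> movss changes above are safe because a scalar 32-bit load
writes only the low dword of the xmm register and zeroes bits 32..127;
here the folded ctlz result is <2 x i64> <55, 0> (ctlz(256) = 55,
ctlz(-1) = 0), i.e. [55,0,0,0] as dwords, so a 4-byte constant-pool
entry reproduces the whole 16-byte vector. An illustrative C sketch of
that equivalence, not part of the patch (it uses movd via
_mm_cvtsi32_si128, which zero-extends the same way the movss load does):

  #include <stdio.h>
  #include <emmintrin.h>

  int main(void) {
    /* The 16-byte constant as the old movaps load read it. */
    const int full[4] = {55, 0, 0, 0};
    __m128i a = _mm_loadu_si128((const __m128i *)full);
    /* movd-style 4-byte load: low dword = 55, bits 32..127 zeroed. */
    __m128i b = _mm_cvtsi32_si128(55);
    /* All 16 bytes compare equal, so the byte mask is 0xFFFF. */
    printf("%d\n", _mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) == 0xFFFF);
    return 0;
  }
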
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
index fe6836c045f3beb..8a0d9a6134cea5e 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -13,7 +13,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -199,7 +199,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -385,7 +385,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -541,7 +541,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -697,7 +697,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -818,7 +818,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -939,7 +939,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -1035,7 +1035,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
index 82afa15079182f6..b233855029c582d 100644
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -13,7 +13,7 @@ declare <4 x i16> @llvm.umul.fix.sat.v4i16(<4 x i16>, <4 x i16>, i32 immarg)
define <4 x i16> @smulfix(<4 x i16> %a) {
; CHECK-LABEL: smulfix:
; CHECK: # %bb.0:
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u]
+; CHECK-NEXT: movq {{.*#+}} xmm1 = [1,2,3,4,0,0,0,0]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm2
; CHECK-NEXT: psrlw $15, %xmm2
@@ -28,7 +28,7 @@ define <4 x i16> @smulfix(<4 x i16> %a) {
define <4 x i16> @umulfix(<4 x i16> %a) {
; CHECK-LABEL: umulfix:
; CHECK: # %bb.0:
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u]
+; CHECK-NEXT: movq {{.*#+}} xmm1 = [1,2,3,4,0,0,0,0]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm2
; CHECK-NEXT: psrlw $15, %xmm2
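In the two smulfix/umulfix hunks above, the upper four i16 lanes of the
pmullw constant were undef ('u' in the asm comment) because only a
<4 x i16> result is used; the movq form loads just the low 8 bytes and
zero-fills bits 64..127, so those lanes become defined zeros and the
printed constant changes from [1,2,3,4,u,u,u,u] to [1,2,3,4,0,0,0,0].
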
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index dbdc3e09fcef081..8056b9a2963c310 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -864,7 +864,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
;
; SSE41-LABEL: test_v4i16_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -914,7 +914,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
; AVX512BW-LABEL: test_v4i16_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 43c9be2dc6f9769..ad810c092bf55ee 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -719,7 +719,7 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; X86-SSE2-LABEL: splatvar_rotate_v2i64:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,0,0,0]
+; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = [64,0,0,0]
; X86-SSE2-NEXT: psubq %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: psllq %xmm1, %xmm3
@@ -815,7 +815,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; SSE41-LABEL: splatvar_rotate_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0]
+; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pandn %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm4
@@ -828,7 +828,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX-LABEL: splatvar_rotate_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -839,7 +839,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512F-LABEL: splatvar_rotate_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -850,7 +850,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512VL-LABEL: splatvar_rotate_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -861,7 +861,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512BW-LABEL: splatvar_rotate_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -872,7 +872,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512VLBW-LABEL: splatvar_rotate_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index c55335f84956979..dae2cf382b8205a 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -649,7 +649,7 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index e15be224bec8cd5..ff41d883380a860 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1786,7 +1786,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,16384,8192,u,u,u,u]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,0,16384,8192,0,0,0,0]
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: psraw $1, %xmm0
@@ -1818,7 +1818,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1896,7 +1896,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index f340615464cfa75..71719e03c7c6dcb 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1486,7 +1486,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u]
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
@@ -1511,7 +1511,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1581,7 +1581,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index 4d4642b18878ebe..19645fd08c9468c 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -1339,7 +1339,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1399,7 +1399,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
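The AVX512BW hunks in the last three shift files follow the same
pattern: the vpsrav/vpsrlv/vpsllv shift amounts only need to be correct
in the lanes that survive the final copy back to xmm (note the '# kill'
annotations), so the repeated halves of the old broadcast constants
([0,1,2,3,0,1,2,3], [2,3,2,3,2,3,2,3]) can be zeroed, letting a
vmovq/vmovd scalar load materialize them from an 8- or 4-byte
constant-pool entry.
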
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index b40be9452ddd77a..e298091bfb983ff 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1003,7 +1003,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; SSSE3-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1011,7 +1011,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; SSE41-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1019,8 +1019,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; AVX1-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1028,7 +1027,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; AVX2-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1036,7 +1035,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; AVX512VLBW-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512VLBW-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
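The shrink of the splatted pshufb masks above works because each result
byte of pshufb depends only on the mask byte in the same position, and
the following vpunpcklqdq keeps only the low qwords of the shuffled
values; the upper eight mask bytes are therefore dead and may be zeroed.
An illustrative C check of that property (a sketch, not part of the
patch):

  #include <stdio.h>
  #include <string.h>
  #include <tmmintrin.h> /* SSSE3: _mm_shuffle_epi8 */

  int main(void) {
    const unsigned char src[16] =
        {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    const unsigned char splat[16] = /* old 16-byte splatted mask */
        {1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15};
    const unsigned char zext[16] = /* new zero-extended 8-byte mask */
        {1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0};
    __m128i v = _mm_loadu_si128((const __m128i *)src);
    __m128i a = _mm_shuffle_epi8(v, _mm_loadu_si128((const __m128i *)splat));
    __m128i b = _mm_shuffle_epi8(v, _mm_loadu_si128((const __m128i *)zext));
    long long lo_a, lo_b;
    memcpy(&lo_a, &a, 8); /* punpcklqdq only consumes these low qwords */
    memcpy(&lo_b, &b, 8);
    printf("%d\n", lo_a == lo_b); /* prints 1: low qwords are identical */
    return 0;
  }
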
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 5b4f15b51ec00ae..bac63f2ddb50598 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -539,7 +539,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -580,7 +580,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -621,7 +621,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -662,7 +662,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,0,0,0]
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [15,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index baa942794ccd8a0..df79e5bc75695fd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1281,7 +1281,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1328,7 +1328,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1375,7 +1375,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1422,7 +1422,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1469,7 +1469,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1516,7 +1516,7 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1563,7 +1563,7 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1610,7 +1610,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [31,0,0,0]
+; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [31,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -2565,12 +2565,10 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,15,14,13,12,11,10,9,8]
-; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0]
-; AVX1-NEXT: # xmm5 = mem[0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
@@ -2771,7 +2769,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2785,7 +2783,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2803,7 +2801,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2817,7 +2815,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; XOPAVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2835,7 +2833,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2849,7 +2847,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; XOPAVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -5080,7 +5078,7 @@ define <32 x i8> @PR55066(<32 x i8> %a0) {
; AVX1-LABEL: PR55066:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index a69a1a18f26e507..02cad49d29fdb2f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -208,7 +208,7 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8f32_06000000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0]
+; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -225,7 +225,7 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8f32_70000000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1808,7 +1808,7 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8i32_06000000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0]
+; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1825,7 +1825,7 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8i32_70000000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index fa0ec33bf34080e..24b1b42c2dc058a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -199,13 +199,13 @@ define <32 x i16> @shuffle_v32i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_1
define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; KNL: ## %bb.0:
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; KNL-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; SKX: ## %bb.0:
-; SKX-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
+; SKX-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0]
; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 4df5307316a4269..f4cc8522adec56d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -109,25 +109,25 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512DQ-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0]
; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512VBMI-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
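The vmovd/vmovss forms remain correct ahead of the 512-bit
vpandq/vandps uses above: a VEX/EVEX-encoded write to an xmm register
zeroes the destination up to bit 511, so the loaded [255,0,0,0] (or
[65535,0,0,0] in the v32i16 test) mask is zero in every upper zmm lane,
just as it was with the old 16-byte vmovdqa/vmovaps load.
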
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 9bfca824fb71a97..008593a239f869e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -142,7 +142,7 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_70000000:
; ALL: # %bb.0:
-; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -960,7 +960,7 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_70000000:
; ALL: # %bb.0:
-; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 1e2ee7b99a608d7..d285d07e66049ca 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1744,26 +1744,18 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movss {{.*#+}} xmm0 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_test1c:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_test1c:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_test1c:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%A = load <4 x i8>, ptr %a
%B = load <4 x i8>, ptr %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1838,26 +1830,18 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_test4c:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_test4c:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_test4c:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%A = load <4 x i8>, ptr %a
%B = load <4 x i8>, ptr %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -3187,7 +3171,7 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
define void @PR43024() {
; SSE2-LABEL: PR43024:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE2-NEXT: movaps %xmm0, (%rax)
; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
@@ -3198,7 +3182,7 @@ define void @PR43024() {
;
; SSSE3-LABEL: PR43024:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSSE3-NEXT: movaps %xmm0, (%rax)
; SSSE3-NEXT: addss %xmm0, %xmm0
; SSSE3-NEXT: xorps %xmm1, %xmm1
@@ -3209,7 +3193,7 @@ define void @PR43024() {
;
; SSE41-LABEL: PR43024:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE41-NEXT: movaps %xmm0, (%rax)
; SSE41-NEXT: addss %xmm0, %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
@@ -3220,7 +3204,7 @@ define void @PR43024() {
;
; AVX-LABEL: PR43024:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; AVX-NEXT: vmovaps %xmm0, (%rax)
; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
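PR43024 is the floating-point flavour of the same shrink: the constant
[NaN,NaN,0.0E+0,0.0E+0] is nonzero only in its low two f32 lanes, so an
8-byte movsd/vmovsd load, which zero-fills the upper 64 bits, replaces
the 16-byte movaps load.
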
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 95a57d1bdf3318a..aca50c461a7a193 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -42,7 +42,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0]
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -56,7 +56,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0]
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [18446744073709551615,0]
; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
@@ -67,7 +67,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
-; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
+; VL_BW_DQ-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0]
; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index f8e3a7a23056fb4..3294c7ffee40d22 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -516,7 +516,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX1-LABEL: trunc8i32_8i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -527,7 +527,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX2-LABEL: trunc8i32_8i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
index 3d5947d8e59bd44..3dc43031cea9efd 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -1762,37 +1762,37 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv2i64:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv2i64:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv2i64:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; BITALG-NEXT: retq
;
; X86-SSE-LABEL: foldv2i64:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0]
; X86-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
ret <2 x i64> %out
@@ -1801,37 +1801,37 @@ define <2 x i64> @foldv2i64() nounwind {
define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64u:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv2i64u:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: foldv2i64u:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv2i64u:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv2i64u:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0]
+; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0]
; BITALG-NEXT: retq
;
; X86-SSE-LABEL: foldv2i64u:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0]
; X86-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
ret <2 x i64> %out
diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll
index 0630a40b88099e0..050b3329a4abb61 100644
--- a/llvm/test/CodeGen/X86/vselect-constants.ll
+++ b/llvm/test/CodeGen/X86/vselect-constants.ll
@@ -281,7 +281,7 @@ define i32 @wrong_min_signbits(<2 x i16> %x) {
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pcmpeqw %xmm0, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm0 = [1,0,0,0]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: por %xmm0, %xmm1
@@ -292,7 +292,7 @@ define i32 @wrong_min_signbits(<2 x i16> %x) {
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0]
; AVX-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect-post-combine.ll b/llvm/test/CodeGen/X86/vselect-post-combine.ll
index e91b8d029bcb4af..474f70f78937eb4 100644
--- a/llvm/test/CodeGen/X86/vselect-post-combine.ll
+++ b/llvm/test/CodeGen/X86/vselect-post-combine.ll
@@ -4,7 +4,7 @@
define ptr @test_mul(ptr %addr) {
; AVX2-LABEL: test_mul:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0]
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vpblendvb %xmm0, (%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll
index 0f121d88b3573d1..18cb5e1b86ec315 100644
--- a/llvm/test/CodeGen/X86/widen_bitcnt.ll
+++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll
@@ -345,7 +345,7 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32
define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
; SSE42-LABEL: widen_ctlz_v2i32_v4i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm3, %xmm6
; SSE42-NEXT: pshufb %xmm0, %xmm6
; SSE42-NEXT: movdqa %xmm0, %xmm5
@@ -394,7 +394,7 @@ define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
;
; AVX2-LABEL: widen_ctlz_v2i32_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -448,7 +448,7 @@ define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
define <8 x i32> @widen_ctlz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) {
; SSE42-LABEL: widen_ctlz_v4i32_v8i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm3, %xmm6
; SSE42-NEXT: pshufb %xmm0, %xmm6
; SSE42-NEXT: movdqa %xmm0, %xmm5
@@ -535,7 +535,7 @@ define <8 x i32> @widen_ctlz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) {
define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) {
; SSE42-LABEL: widen_ctlz_v2i32_v8i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm5, %xmm8
; SSE42-NEXT: pshufb %xmm0, %xmm8
; SSE42-NEXT: movdqa %xmm0, %xmm7
@@ -629,7 +629,7 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm4
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm6
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -724,7 +724,7 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>
define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
; SSE42-LABEL: widen_ctlz_undef_v2i32_v4i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm3, %xmm6
; SSE42-NEXT: pshufb %xmm0, %xmm6
; SSE42-NEXT: movdqa %xmm0, %xmm5
@@ -773,7 +773,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
;
; AVX2-LABEL: widen_ctlz_undef_v2i32_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -827,7 +827,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) {
; SSE42-LABEL: widen_ctlz_undef_v4i32_v8i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm3, %xmm6
; SSE42-NEXT: pshufb %xmm0, %xmm6
; SSE42-NEXT: movdqa %xmm0, %xmm5
@@ -914,7 +914,7 @@ define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) {
define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) {
; SSE42-LABEL: widen_ctlz_undef_v2i32_v8i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE42-NEXT: movq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE42-NEXT: movdqa %xmm5, %xmm8
; SSE42-NEXT: pshufb %xmm0, %xmm8
; SSE42-NEXT: movdqa %xmm0, %xmm7
@@ -1008,7 +1008,7 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm4
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm6
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 742a3a0c66a2ac6..a58d28695caeaf2 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -436,7 +436,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind {
define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -444,35 +444,35 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -488,39 +488,39 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -563,7 +563,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
@@ -571,7 +571,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm4
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vmovd {{.*#+}} xmm8 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm5
; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm7
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
@@ -587,11 +587,11 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm8
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2,3],xmm10[4,5,6,7]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm8
; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm12
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-NEXT: vmovd {{.*#+}} xmm12 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -605,11 +605,11 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
; AVX1-NEXT: vpcmpeqb %xmm9, %xmm10, %xmm9
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -621,11 +621,11 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -653,7 +653,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
@@ -669,7 +669,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7
; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
@@ -685,7 +685,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
@@ -700,7 +700,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index 023ac96181d3cc6..8c9dc90d2a71da6 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -1686,7 +1686,7 @@ define void @vec256_v32i8_to_v2i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -1783,7 +1783,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1794,7 +1794,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1805,7 +1805,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -1997,7 +1997,7 @@ define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -2126,7 +2126,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2137,7 +2137,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2148,7 +2148,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3360,7 +3360,7 @@ define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
@@ -3483,7 +3483,7 @@ define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -3609,7 +3609,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -3622,7 +3622,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
@@ -3635,7 +3635,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -4261,7 +4261,7 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
@@ -4442,7 +4442,7 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -4589,7 +4589,7 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -4602,7 +4602,7 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
@@ -5998,7 +5998,7 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
@@ -6182,7 +6182,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -6217,7 +6217,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -6231,7 +6231,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -6245,7 +6245,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -6297,7 +6297,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6310,7 +6310,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
@@ -6323,7 +6323,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -6573,7 +6573,7 @@ define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
@@ -6755,7 +6755,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -6809,7 +6809,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -6823,7 +6823,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -6837,7 +6837,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512BW-NEXT: vpand %ymm0, %ymm1, %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -6908,7 +6908,7 @@ define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6921,7 +6921,7 @@ define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index b4c79aff5cef1f2..8ab53140eb91107 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: paddb 48(%rsi), %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 32(%rsi), %xmm1
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX-NEXT: # xmm3 = mem[0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -6055,7 +6054,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6068,7 +6067,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6081,7 +6080,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6520,7 +6519,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6533,7 +6532,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6546,7 +6545,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 23f02e9245eedad..c362bdaa3217d04 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 32(%rdi), %xmm1
; SSE42-NEXT: movdqa 48(%rdi), %xmm2
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX-NEXT: # xmm3 = mem[0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -4884,7 +4883,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
;
; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0]
; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -4895,7 +4894,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
;
; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0]
; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -4906,7 +4905,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
;
; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0]
; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -5291,7 +5290,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
;
; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0]
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [65535,0,0,0]
; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -5302,7 +5301,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
;
; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0]
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = [65535,0,0,0]
; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -5313,7 +5312,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
;
; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = [65535,0,0,0]
; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0